Update to Linux 2.6.10.
40f56238bnvciAuyzAiMkdzGErYt1A linux-2.6.10-xen-sparse/arch/xen/i386/kernel/head.S
40f58a0d31M2EkuPbG94ns_nOi0PVA linux-2.6.10-xen-sparse/arch/xen/i386/kernel/i386_ksyms.c
40faa751_zbZlAmLyQgCXdYekVFdWA linux-2.6.10-xen-sparse/arch/xen/i386/kernel/ioport.c
+41d00d82zN8IfLBRxc7G_i7lbwT3cQ linux-2.6.10-xen-sparse/arch/xen/i386/kernel/irq.c
40f56238ue3YRsK52HG7iccNzP1AwQ linux-2.6.10-xen-sparse/arch/xen/i386/kernel/ldt.c
4107adf1cNtsuOxOB4T6paAoY2R2PA linux-2.6.10-xen-sparse/arch/xen/i386/kernel/pci-dma.c
40f56238a8iOVDEoostsbun_sy2i4g linux-2.6.10-xen-sparse/arch/xen/i386/kernel/process.c
40f56238YQIJoYG2ehDGEcdTgLmGbg linux-2.6.10-xen-sparse/arch/xen/i386/kernel/setup.c
40f56238nWMQg7CKbyTy0KJNvCzbtg linux-2.6.10-xen-sparse/arch/xen/i386/kernel/signal.c
+41811cac4lkCB-fHir6CcxuEJ2pGsQ linux-2.6.10-xen-sparse/arch/xen/i386/kernel/smp.c
+41811ca9mbGpqBrZVrUGEiv8CTV3ng linux-2.6.10-xen-sparse/arch/xen/i386/kernel/smpboot.c
40f56238qVGkpO_ycnQA8k03kQzAgA linux-2.6.10-xen-sparse/arch/xen/i386/kernel/time.c
40f56238NzTgeO63RGoxHrW5NQeO3Q linux-2.6.10-xen-sparse/arch/xen/i386/kernel/timers/Makefile
40f56238BMqG5PuSHufpjbvp_helBw linux-2.6.10-xen-sparse/arch/xen/i386/kernel/timers/timer_tsc.c
412dfae9eA3_6e6bCGUtg1mj8b56fQ linux-2.6.10-xen-sparse/arch/xen/kernel/gnttab.c
40f562392LBhwmOxVPsYdkYXMxI_ZQ linux-2.6.10-xen-sparse/arch/xen/kernel/reboot.c
414c113396tK1HTVeUalm3u-1DF16g linux-2.6.10-xen-sparse/arch/xen/kernel/skbuff.c
+418f90e4lGdeJK9rmbOB1kN-IKSjsQ linux-2.6.10-xen-sparse/arch/xen/kernel/smp.c
3f68905c5eiA-lBMQSvXLMWS1ikDEA linux-2.6.10-xen-sparse/arch/xen/kernel/xen_proc.c
41261688yS8eAyy-7kzG4KBs0xbYCA linux-2.6.10-xen-sparse/drivers/Makefile
4108f5c1WfTIrs0HZFeV39sttekCTw linux-2.6.10-xen-sparse/drivers/char/mem.c
40f56239-JNIaTzlviVJohVdoYOUpw linux-2.6.10-xen-sparse/drivers/xen/blkfront/blkfront.c
40f56239y9naBTXe40Pi2J_z3p-d1g linux-2.6.10-xen-sparse/drivers/xen/blkfront/block.h
40f56239BVfPsXBiWQitXgDRtOsiqg linux-2.6.10-xen-sparse/drivers/xen/blkfront/vbd.c
+41a226e0vjAcDXHOnXE5ummcdUD2mg linux-2.6.10-xen-sparse/drivers/xen/blktap/Makefile
+41a226e0VeZA1N8tbU6nvJ3OxUcJmw linux-2.6.10-xen-sparse/drivers/xen/blktap/blktap.c
+41a226e1k4J5VMLnrYXDWRqElS49YQ linux-2.6.10-xen-sparse/drivers/xen/blktap/blktap.h
+41a226e1-A_Hy7utS8vJKaXnH_tzfA linux-2.6.10-xen-sparse/drivers/xen/blktap/blktap_controlmsg.c
+41a226e19NoUUTOvs7jumDMRYDIO4Q linux-2.6.10-xen-sparse/drivers/xen/blktap/blktap_datapath.c
+41a226e1MNSyWWK5dEVgvSQ5OW0fDA linux-2.6.10-xen-sparse/drivers/xen/blktap/blktap_userdev.c
40f56239fsLjvtD8YBRAWphps4FDjg linux-2.6.10-xen-sparse/drivers/xen/console/Makefile
3e5a4e651TH-SXHoufurnWjgl5bfOA linux-2.6.10-xen-sparse/drivers/xen/console/console.c
40f56239KYxO0YabhPzCTeUuln-lnA linux-2.6.10-xen-sparse/drivers/xen/evtchn/Makefile
40f5623aKXkBBxgpLx2NcvkncQ1Yyw linux-2.6.10-xen-sparse/include/asm-xen/asm-i386/mach-xen/irq_vectors.h
40f5623aDMCsWOFO0jktZ4e8sjwvEg linux-2.6.10-xen-sparse/include/asm-xen/asm-i386/mach-xen/setup_arch_post.h
40f5623arsFXkGdPvIqvFi3yFXGR0Q linux-2.6.10-xen-sparse/include/asm-xen/asm-i386/mach-xen/setup_arch_pre.h
+41811f07Iri9hrvs97t-baxmhOwWDQ linux-2.6.10-xen-sparse/include/asm-xen/asm-i386/mach-xen/smpboot_hooks.h
4120f807GCO0uqsLqdZj9csxR1Wthw linux-2.6.10-xen-sparse/include/asm-xen/asm-i386/mmu_context.h
40f5623adgjZq9nAgCt0IXdWl7udSA linux-2.6.10-xen-sparse/include/asm-xen/asm-i386/page.h
40f5623a54NuG-7qHihGYmw4wWQnMA linux-2.6.10-xen-sparse/include/asm-xen/asm-i386/param.h
412ea0afQL2CAI-f522TbLjLPMibPQ linux-2.6.10-xen-sparse/include/asm-xen/asm-i386/ptrace.h
40f5623bzLvxr7WoJIxVf2OH4rCBJg linux-2.6.10-xen-sparse/include/asm-xen/asm-i386/segment.h
40f5623bG_LzgG6-qwk292nTc5Wabw linux-2.6.10-xen-sparse/include/asm-xen/asm-i386/setup.h
+4198c32a8NzmcKVOzKaEJfaQxxiA0A linux-2.6.10-xen-sparse/include/asm-xen/asm-i386/spinlock.h
40f5623bgzm_9vwxpzJswlAxg298Gg linux-2.6.10-xen-sparse/include/asm-xen/asm-i386/synch_bitops.h
40f5623bVdKP7Dt7qm8twu3NcnGNbA linux-2.6.10-xen-sparse/include/asm-xen/asm-i386/system.h
40f5623bc8LKPRO09wY5dGDnY_YCpw linux-2.6.10-xen-sparse/include/asm-xen/asm-i386/tlbflush.h
412f46c0LJuKAgSPGoC0Z1DEkLfuLA linux-2.6.10-xen-sparse/mm/memory.c
410a94a4KT6I6X0LVc7djB39tRDp4g linux-2.6.10-xen-sparse/mm/page_alloc.c
41505c572m-s9ATiO1LiD1GPznTTIg linux-2.6.10-xen-sparse/net/core/skbuff.c
-41811cac4lkCB-fHir6CcxuEJ2pGsQ linux-2.6.9-xen-sparse/arch/xen/i386/kernel/smp.c
-41811ca9mbGpqBrZVrUGEiv8CTV3ng linux-2.6.9-xen-sparse/arch/xen/i386/kernel/smpboot.c
-418f90e4lGdeJK9rmbOB1kN-IKSjsQ linux-2.6.9-xen-sparse/arch/xen/kernel/smp.c
-41a226e0vjAcDXHOnXE5ummcdUD2mg linux-2.6.9-xen-sparse/drivers/xen/blktap/Makefile
-41a226e0VeZA1N8tbU6nvJ3OxUcJmw linux-2.6.9-xen-sparse/drivers/xen/blktap/blktap.c
-41a226e1k4J5VMLnrYXDWRqElS49YQ linux-2.6.9-xen-sparse/drivers/xen/blktap/blktap.h
-41a226e1-A_Hy7utS8vJKaXnH_tzfA linux-2.6.9-xen-sparse/drivers/xen/blktap/blktap_controlmsg.c
-41a226e19NoUUTOvs7jumDMRYDIO4Q linux-2.6.9-xen-sparse/drivers/xen/blktap/blktap_datapath.c
-41a226e1MNSyWWK5dEVgvSQ5OW0fDA linux-2.6.9-xen-sparse/drivers/xen/blktap/blktap_userdev.c
-41a64cdeQ5SWVEVbSZ0K-IeHHhIJ_w linux-2.6.9-xen-sparse/include/asm-xen/asm-i386/hardirq.h
-41811f07Iri9hrvs97t-baxmhOwWDQ linux-2.6.9-xen-sparse/include/asm-xen/asm-i386/mach-xen/smpboot_hooks.h
-4198c32a8NzmcKVOzKaEJfaQxxiA0A linux-2.6.9-xen-sparse/include/asm-xen/asm-i386/spinlock.h
413cb1e4zst25MDYjg63Y-NGC5_pLg netbsd-2.0-xen-sparse/Makefile
413cb1e5c_Mkxf_X0zimEhTKI_l4DA netbsd-2.0-xen-sparse/mkbuildtree
413cb1e5kY_Zil7-b0kI6hvCIxBEYg netbsd-2.0-xen-sparse/nbconfig-xen
#
# Automatically generated make config: don't edit
-# Linux kernel version: 2.6.10-rc3-xen0
-# Sun Dec 26 10:34:29 2004
+# Linux kernel version: 2.6.10-xen0
+# Mon Dec 27 10:14:40 2004
#
CONFIG_XEN=y
CONFIG_ARCH_XEN=y
CONFIG_XEN_NETDEV_BACKEND=y
CONFIG_XEN_BLKDEV_FRONTEND=y
CONFIG_XEN_NETDEV_FRONTEND=y
-# CONFIG_XEN_BLKDEV_TAP is not set
# CONFIG_XEN_NETDEV_FRONTEND_PIPELINED_TRANSMITTER is not set
+# CONFIG_XEN_BLKDEV_TAP is not set
CONFIG_XEN_WRITABLE_PAGETABLES=y
CONFIG_XEN_SCRUB_PAGES=y
CONFIG_HAVE_ARCH_DEV_ALLOC_SKB=y
#
# Automatically generated make config: don't edit
-# Linux kernel version: 2.6.10-rc3-xenU
-# Sun Dec 26 10:35:15 2004
+# Linux kernel version: 2.6.10-xenU
+# Mon Dec 27 10:15:03 2004
#
CONFIG_XEN=y
CONFIG_ARCH_XEN=y
# CONFIG_XEN_PRIVILEGED_GUEST is not set
# CONFIG_XEN_PHYSDEV_ACCESS is not set
# CONFIG_XEN_BLKDEV_BACKEND is not set
-# CONFIG_XEN_BLKDEV_TAP_BE is not set
# CONFIG_XEN_NETDEV_BACKEND is not set
CONFIG_XEN_BLKDEV_FRONTEND=y
CONFIG_XEN_NETDEV_FRONTEND=y
-# CONFIG_XEN_BLKDEV_TAP is not set
# CONFIG_XEN_NETDEV_FRONTEND_PIPELINED_TRANSMITTER is not set
+# CONFIG_XEN_BLKDEV_TAP is not set
CONFIG_XEN_WRITABLE_PAGETABLES=y
CONFIG_XEN_SCRUB_PAGES=y
CONFIG_HAVE_ARCH_DEV_ALLOC_SKB=y
--- /dev/null
+/*
+ * linux/arch/i386/kernel/irq.c
+ *
+ * Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar
+ *
+ * This file contains the lowest level x86-specific interrupt
+ * entry, irq-stacks and irq statistics code. All the remaining
+ * irq logic is done by the generic kernel/irq/ code and
+ * by the x86-specific irq controller code. (e.g. i8259.c and
+ * io_apic.c.)
+ */
+
+#include <asm/uaccess.h>
+#include <linux/module.h>
+#include <linux/seq_file.h>
+#include <linux/interrupt.h>
+#include <linux/kernel_stat.h>
+
+#ifndef CONFIG_X86_LOCAL_APIC
+/*
+ * 'what should we do if we get a hw irq event on an illegal vector'.
+ * each architecture has to answer this themselves.
+ *
+ * Here the answer is simply to log the offending vector number, in hex,
+ * with printk(); nothing else is done.
+ */
+void ack_bad_irq(unsigned int irq)
+{
+ printk("unexpected IRQ trap at vector %02x\n", irq);
+}
+#endif
+
+#ifdef CONFIG_4KSTACKS
+/*
+ * per-CPU IRQ handling contexts (thread information and stack)
+ *
+ * The union overlays a struct thread_info on the start of a
+ * THREAD_SIZE-byte stack area. hardirq_ctx[] and softirq_ctx[] hold one
+ * pointer per CPU; they are filled in by irq_ctx_init() and read by
+ * do_IRQ() and do_softirq() respectively.
+ */
+union irq_ctx {
+ struct thread_info tinfo;
+ u32 stack[THREAD_SIZE/sizeof(u32)];
+};
+
+static union irq_ctx *hardirq_ctx[NR_CPUS];
+static union irq_ctx *softirq_ctx[NR_CPUS];
+#endif
+
+/*
+ * do_IRQ handles all normal device IRQ's (the special
+ * SMP cross-CPU interrupts have their own specific
+ * handlers).
+ *
+ * The irq number is taken from the low bits of regs->orig_eax and the
+ * actual work is delegated to __do_IRQ(), bracketed by irq_enter() and
+ * irq_exit(). With CONFIG_4KSTACKS the call is made on this CPU's
+ * dedicated hardirq stack unless we are already running on it.
+ * Always returns 1.
+ */
+fastcall unsigned int do_IRQ(struct pt_regs *regs)
+{
+ /* high bits used in ret_from_ code */
+ int irq = regs->orig_eax & __IRQ_MASK(HARDIRQ_BITS);
+#ifdef CONFIG_4KSTACKS
+ union irq_ctx *curctx, *irqctx;
+ u32 *isp;
+#endif
+
+ irq_enter();
+#ifdef CONFIG_DEBUG_STACKOVERFLOW
+ /*
+ * Debugging check for stack overflow: is there less than 1KB free?
+ * The andl leaves in 'esp' the offset of the stack pointer within
+ * the current THREAD_SIZE-sized stack area.
+ */
+ {
+ long esp;
+
+ __asm__ __volatile__("andl %%esp,%0" :
+ "=r" (esp) : "0" (THREAD_SIZE - 1));
+ if (unlikely(esp < (sizeof(struct thread_info) + STACK_WARN))) {
+ printk("do_IRQ: stack overflow: %ld\n",
+ esp - sizeof(struct thread_info));
+ dump_stack();
+ }
+ }
+#endif
+
+#ifdef CONFIG_4KSTACKS
+
+ curctx = (union irq_ctx *) current_thread_info();
+ irqctx = hardirq_ctx[smp_processor_id()];
+
+ /*
+ * this is where we switch to the IRQ stack. However, if we are
+ * already using the IRQ stack (because we interrupted a hardirq
+ * handler) we can't do that and just have to keep using the
+ * current stack (which is the irq stack already after all)
+ */
+ if (curctx != irqctx) {
+ int arg1, arg2, ebx;
+
+ /*
+ * build the stack frame on the IRQ stack: isp points just past
+ * the end of the irq_ctx area, and the interrupted task and
+ * stack pointer are recorded in the IRQ stack's thread_info.
+ */
+ isp = (u32*) ((char*)irqctx + sizeof(*irqctx));
+ irqctx->tinfo.task = curctx->tinfo.task;
+ irqctx->tinfo.previous_esp = current_stack_pointer;
+
+ /*
+ * %ebx enters holding isp. The xchgl makes it the stack pointer
+ * and parks the old %esp in %ebx; the final movl restores it.
+ * irq and regs are passed in %eax and %edx.
+ * NOTE(review): this relies on __do_IRQ taking its arguments in
+ * registers and preserving %ebx - confirm against its declaration.
+ */
+ asm volatile(
+ " xchgl %%ebx,%%esp \n"
+ " call __do_IRQ \n"
+ " movl %%ebx,%%esp \n"
+ : "=a" (arg1), "=d" (arg2), "=b" (ebx)
+ : "0" (irq), "1" (regs), "2" (isp)
+ : "memory", "cc", "ecx"
+ );
+ } else
+#endif
+ __do_IRQ(irq, regs);
+
+ irq_exit();
+
+ return 1;
+}
+
+#ifdef CONFIG_4KSTACKS
+
+/*
+ * Backing storage for the per-CPU irq stacks: each array provides one
+ * THREAD_SIZE-byte slice per possible CPU and is aligned to THREAD_SIZE.
+ * irq_ctx_init() hands out slice 'cpu' of each array.
+ *
+ * These should really be __section__(".bss.page_aligned") as well, but
+ * gcc's 3.0 and earlier don't handle that correctly.
+ */
+static char softirq_stack[NR_CPUS * THREAD_SIZE]
+ __attribute__((__aligned__(THREAD_SIZE)));
+
+static char hardirq_stack[NR_CPUS * THREAD_SIZE]
+ __attribute__((__aligned__(THREAD_SIZE)));
+
+/*
+ * Initialise the thread_info that sits at the base of one irq stack and
+ * return the stack viewed as an irq_ctx. 'preempt_count' is the initial
+ * preempt count for code running on that stack.
+ */
+static union irq_ctx *irq_ctx_setup(char *stack_base, int cpu,
+				    int preempt_count)
+{
+	union irq_ctx *ctx = (union irq_ctx*) stack_base;
+
+	ctx->tinfo.task = NULL;
+	ctx->tinfo.exec_domain = NULL;
+	ctx->tinfo.cpu = cpu;
+	ctx->tinfo.preempt_count = preempt_count;
+	ctx->tinfo.addr_limit = MAKE_MM_SEG(0);
+
+	return ctx;
+}
+
+/*
+ * Set up the hardirq and softirq stacks of 'cpu'. Does nothing if the
+ * hardirq context of that CPU has already been set up. The resulting
+ * addresses are reported with printk().
+ */
+void irq_ctx_init(int cpu)
+{
+	if (hardirq_ctx[cpu])
+		return;
+
+	hardirq_ctx[cpu] = irq_ctx_setup(&hardirq_stack[cpu*THREAD_SIZE],
+					 cpu, HARDIRQ_OFFSET);
+	softirq_ctx[cpu] = irq_ctx_setup(&softirq_stack[cpu*THREAD_SIZE],
+					 cpu, SOFTIRQ_OFFSET);
+
+	printk("CPU %u irqstacks, hard=%p soft=%p\n",
+	       cpu,hardirq_ctx[cpu],softirq_ctx[cpu]);
+}
+
+extern asmlinkage void __do_softirq(void);
+
+/*
+ * Run pending softirqs on this CPU's dedicated softirq stack.
+ *
+ * Returns immediately when called from interrupt context. Otherwise,
+ * with local interrupts saved and disabled, it checks for pending
+ * softirqs and, if there are any, calls __do_softirq() after switching
+ * the stack pointer to the top of softirq_ctx[this cpu].
+ */
+asmlinkage void do_softirq(void)
+{
+ unsigned long flags;
+ struct thread_info *curctx;
+ union irq_ctx *irqctx;
+ u32 *isp;
+
+ if (in_interrupt())
+ return;
+
+ local_irq_save(flags);
+
+ if (local_softirq_pending()) {
+ curctx = current_thread_info();
+ irqctx = softirq_ctx[smp_processor_id()];
+ irqctx->tinfo.task = curctx->task;
+ irqctx->tinfo.previous_esp = current_stack_pointer;
+
+ /*
+ * build the stack frame on the softirq stack: isp points just
+ * past the end of the irq_ctx area. In the asm below %ebx holds
+ * isp on entry; the xchgl installs it as %esp and keeps the old
+ * %esp in %ebx, and the movl restores it after the call.
+ * NOTE(review): relies on __do_softirq preserving %ebx - confirm.
+ */
+ isp = (u32*) ((char*)irqctx + sizeof(*irqctx));
+
+ asm volatile(
+ " xchgl %%ebx,%%esp \n"
+ " call __do_softirq \n"
+ " movl %%ebx,%%esp \n"
+ : "=b"(isp)
+ : "0"(isp)
+ : "memory", "cc", "edx", "ecx", "eax"
+ );
+ }
+
+ local_irq_restore(flags);
+}
+
+EXPORT_SYMBOL(do_softirq);
+#endif
+
+/*
+ * Interrupt statistics:
+ *
+ * irq_err_count is printed as the "ERR:" row by show_interrupts().
+ */
+
+atomic_t irq_err_count;
+
+/*
+ * /proc/interrupts printing:
+ *
+ * '*v' is read as the index of the record to print:
+ *   - index 0 first emits the header row naming every online CPU;
+ *   - an index below NR_IRQS prints the row for that irq, provided it
+ *     has at least one action registered (counts, controller type name
+ *     and the comma-separated action names);
+ *   - index NR_IRQS prints the summary rows (NMI, LOC when the local
+ *     APIC is configured, ERR, MIS when the IO-APIC is configured).
+ * Always returns 0.
+ */
+
+int show_interrupts(struct seq_file *p, void *v)
+{
+	int i = *(loff_t *) v;
+	int cpu;
+	struct irqaction *action;
+	unsigned long flags;
+
+	/* header row, emitted together with the first record */
+	if (i == 0) {
+		seq_printf(p, " ");
+		for (cpu = 0; cpu < NR_CPUS; cpu++) {
+			if (!cpu_online(cpu))
+				continue;
+			seq_printf(p, "CPU%d ", cpu);
+		}
+		seq_putc(p, '\n');
+	}
+
+	/* past the summary record: nothing to print */
+	if (i > NR_IRQS)
+		return 0;
+
+	/* summary rows */
+	if (i == NR_IRQS) {
+		seq_printf(p, "NMI: ");
+		for (cpu = 0; cpu < NR_CPUS; cpu++) {
+			if (!cpu_online(cpu))
+				continue;
+			seq_printf(p, "%10u ", nmi_count(cpu));
+		}
+		seq_putc(p, '\n');
+#ifdef CONFIG_X86_LOCAL_APIC
+		seq_printf(p, "LOC: ");
+		for (cpu = 0; cpu < NR_CPUS; cpu++) {
+			if (!cpu_online(cpu))
+				continue;
+			seq_printf(p, "%10u ",
+				   irq_stat[cpu].apic_timer_irqs);
+		}
+		seq_putc(p, '\n');
+#endif
+		seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
+#if defined(CONFIG_X86_IO_APIC)
+		seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count));
+#endif
+		return 0;
+	}
+
+	/* one row for irq 'i'; the descriptor lock covers the action list walk */
+	spin_lock_irqsave(&irq_desc[i].lock, flags);
+	action = irq_desc[i].action;
+	if (action) {
+		seq_printf(p, "%3d: ",i);
+#ifndef CONFIG_SMP
+		seq_printf(p, "%10u ", kstat_irqs(i));
+#else
+		for (cpu = 0; cpu < NR_CPUS; cpu++) {
+			if (!cpu_online(cpu))
+				continue;
+			seq_printf(p, "%10u ", kstat_cpu(cpu).irqs[i]);
+		}
+#endif
+		seq_printf(p, " %14s", irq_desc[i].handler->typename);
+		seq_printf(p, " %s", action->name);
+
+		while ((action = action->next) != NULL)
+			seq_printf(p, ", %s", action->name);
+
+		seq_putc(p, '\n');
+	}
+	spin_unlock_irqrestore(&irq_desc[i].lock, flags);
+
+	return 0;
+}
--- /dev/null
+/*
+ * Intel SMP support routines.
+ *
+ * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
+ * (c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com>
+ *
+ * This code is released under the GNU General Public License version 2 or
+ * later.
+ */
+
+#include <linux/init.h>
+
+#include <linux/mm.h>
+#include <linux/irq.h>
+#include <linux/delay.h>
+#include <linux/spinlock.h>
+#include <linux/smp_lock.h>
+#include <linux/kernel_stat.h>
+#include <linux/mc146818rtc.h>
+#include <linux/cache.h>
+#include <linux/interrupt.h>
+
+#include <asm/mtrr.h>
+#include <asm/tlbflush.h>
+#if 0
+#include <mach_apic.h>
+#endif
+#include <asm-xen/evtchn.h>
+
+/*
+ * Debug helper: writes 'msg' through HYPERVISOR_console_io() using the
+ * CONSOLEIO_write command. 'msg' is expanded twice (once for strlen(),
+ * once as the data pointer), so only pass side-effect-free expressions.
+ */
+#define xxprint(msg) HYPERVISOR_console_io(CONSOLEIO_write, strlen(msg), msg)
+
+/*
+ * Some notes on x86 processor bugs affecting SMP operation:
+ *
+ * Pentium, Pentium Pro, II, III (and all CPUs) have bugs.
+ * The Linux implications for SMP are handled as follows:
+ *
+ * Pentium III / [Xeon]
+ * None of the E1AP-E3AP errata are visible to the user.
+ *
+ * E1AP. see PII A1AP
+ * E2AP. see PII A2AP
+ * E3AP. see PII A3AP
+ *
+ * Pentium II / [Xeon]
+ * None of the A1AP-A3AP errata are visible to the user.
+ *
+ * A1AP. see PPro 1AP
+ * A2AP. see PPro 2AP
+ * A3AP. see PPro 7AP
+ *
+ * Pentium Pro
+ * None of 1AP-9AP errata are visible to the normal user,
+ * except occasional delivery of 'spurious interrupt' as trap #15.
+ * This is very rare and a non-problem.
+ *
+ * 1AP. Linux maps APIC as non-cacheable
+ * 2AP. worked around in hardware
+ * 3AP. fixed in C0 and above steppings microcode update.
+ * Linux does not use excessive STARTUP_IPIs.
+ * 4AP. worked around in hardware
+ * 5AP. symmetric IO mode (normal Linux operation) not affected.
+ * 'noapic' mode has vector 0xf filled out properly.
+ * 6AP. 'noapic' mode might be affected - fixed in later steppings
+ * 7AP. We do not assume writes to the LVT deasserting IRQs
+ * 8AP. We do not enable low power mode (deep sleep) during MP bootup
+ * 9AP. We do not use mixed mode
+ *
+ * Pentium
+ * There is a marginal case where REP MOVS on 100MHz SMP
+ * machines with B stepping processors can fail. XXX should provide
+ * an L1cache=Writethrough or L1cache=off option.
+ *
+ * B stepping CPUs may hang. There are hardware work arounds
+ * for this. We warn about it in case your board doesn't have the work
+ * arounds. Basically that's so I can tell anyone with a B stepping
+ * CPU and SMP problems "tough".
+ *
+ * Specific items [From Pentium Processor Specification Update]
+ *
+ * 1AP. Linux doesn't use remote read
+ * 2AP. Linux doesn't trust APIC errors
+ * 3AP. We work around this
+ * 4AP. Linux never generated 3 interrupts of the same priority
+ * to cause a lost local interrupt.
+ * 5AP. Remote read is never used
+ * 6AP. not affected - worked around in hardware
+ * 7AP. not affected - worked around in hardware
+ * 8AP. worked around in hardware - we get explicit CS errors if not
+ * 9AP. only 'noapic' mode affected. Might generate spurious
+ * interrupts, we log only the first one and count the
+ * rest silently.
+ * 10AP. not affected - worked around in hardware
+ * 11AP. Linux reads the APIC between writes to avoid this, as per
+ * the documentation. Make sure you preserve this as it affects
+ * the C stepping chips too.
+ * 12AP. not affected - worked around in hardware
+ * 13AP. not affected - worked around in hardware
+ * 14AP. we always deassert INIT during bootup
+ * 15AP. not affected - worked around in hardware
+ * 16AP. not affected - worked around in hardware
+ * 17AP. not affected - worked around in hardware
+ * 18AP. not affected - worked around in hardware
+ * 19AP. not affected - worked around in BIOS
+ *
+ * If this sounds worrying believe me these bugs are either ___RARE___,
+ * or are signal timing bugs worked around in hardware and there's
+ * about nothing of note with C stepping upwards.
+ */
+
+DEFINE_PER_CPU(struct tlb_state, cpu_tlbstate) ____cacheline_aligned = { &init_mm, 0, };
+
+/*
+ * the following functions deal with sending IPIs between CPUs.
+ *
+ * We use 'broadcast', CPU->CPU IPIs and self-IPIs too.
+ */
+
+/*
+ * Compose an ICR value: fixed delivery mode, logical destination mode,
+ * combined with the given destination shorthand and vector.
+ */
+static inline int __prepare_ICR (unsigned int shortcut, int vector)
+{
+	int icr = APIC_DM_FIXED | shortcut | vector | APIC_DEST_LOGICAL;
+
+	return icr;
+}
+
+/*
+ * Compose an ICR2 value by placing 'mask' in the APIC destination field.
+ */
+static inline int __prepare_ICR2 (unsigned int mask)
+{
+	int icr2 = SET_APIC_DEST_FIELD(mask);
+
+	return icr2;
+}
+
+DECLARE_PER_CPU(int, ipi_to_evtchn[NR_IPIS]);
+
+/*
+ * Send IPI 'vector' to 'cpu' by notifying the event channel recorded for
+ * that (cpu, vector) pair in the per-cpu ipi_to_evtchn[] table.
+ *
+ * A table entry of 0 is treated as "not bound": nothing is sent and a
+ * diagnostic naming the cpu and vector is logged instead.
+ */
+static inline void __send_IPI_one(unsigned int cpu, int vector)
+{
+	unsigned int evtchn;
+
+	evtchn = per_cpu(ipi_to_evtchn, cpu)[vector];
+	// printk("send_IPI_mask_bitmask cpu %d vector %d evtchn %d\n", cpu, vector, evtchn);
+	if (evtchn) {
+#if 0
+		/* disabled: spin until the channel is neither pending nor masked */
+		shared_info_t *s = HYPERVISOR_shared_info;
+		while (synch_test_bit(evtchn, &s->evtchn_pending[0]) ||
+		       synch_test_bit(evtchn, &s->evtchn_mask[0]))
+			;
+#endif
+		notify_via_evtchn(evtchn);
+	} else
+		/* terminate the line so the next log message is not glued to it */
+		printk("send_IPI to unbound port %u/%d\n",
+		       cpu, vector);
+}
+
+/*
+ * Deliver 'vector' according to an APIC-style destination shorthand.
+ *
+ *   APIC_DEST_SELF   - send to the CPU we are running on.
+ *   APIC_DEST_ALLBUT - send to every online CPU except the current one.
+ *
+ * Any other shorthand is not acted upon and is only reported via printk().
+ */
+void __send_IPI_shortcut(unsigned int shortcut, int vector)
+{
+	int target;
+
+	if (shortcut == APIC_DEST_SELF) {
+		__send_IPI_one(smp_processor_id(), vector);
+		return;
+	}
+
+	if (shortcut != APIC_DEST_ALLBUT) {
+		printk("XXXXXX __send_IPI_shortcut %08x vector %d\n", shortcut,
+		       vector);
+		return;
+	}
+
+	for (target = 0; target < NR_CPUS; ++target) {
+		if (target != smp_processor_id() &&
+		    cpu_isset(target, cpu_online_map))
+			__send_IPI_one(target, vector);
+	}
+}
+
+/*
+ * Send 'vector' to the current CPU; thin wrapper around
+ * __send_IPI_shortcut() with the APIC_DEST_SELF shorthand.
+ */
+void fastcall send_IPI_self(int vector)
+{
+ __send_IPI_shortcut(APIC_DEST_SELF, vector);
+}
+
+/*
+ * Send 'vector' to every CPU whose bit is set in 'mask', one CPU at a
+ * time, with local interrupts disabled for the duration of the walk.
+ *
+ * This is only used on smaller machines.
+ */
+void send_IPI_mask_bitmask(cpumask_t mask, int vector)
+{
+	unsigned long irq_flags;
+	unsigned int target;
+
+	local_irq_save(irq_flags);
+
+	for (target = 0; target < NR_CPUS; ++target) {
+		if (!cpu_isset(target, mask))
+			continue;
+		__send_IPI_one(target, vector);
+	}
+
+	local_irq_restore(irq_flags);
+}
+
+/*
+ * Same behaviour as send_IPI_mask_bitmask(): this simply forwards to it.
+ */
+inline void send_IPI_mask_sequence(cpumask_t mask, int vector)
+{
+
+ send_IPI_mask_bitmask(mask, vector);
+}
+
+#include <mach_ipi.h> /* must come after the send_IPI functions above for inlining */
+
+/*
+ * Smarter SMP flushing macros.
+ * c/o Linus Torvalds.
+ *
+ * These mean you can really definitely utterly forget about
+ * writing to user space from interrupts. (It's not allowed anyway).
+ *
+ * Optimizations Manfred Spraul <manfred@colorfullife.com>
+ */
+
+static cpumask_t flush_cpumask; /* CPUs that have not yet handled the current flush request */
+static struct mm_struct * flush_mm; /* mm the current flush request applies to */
+static unsigned long flush_va; /* address to flush, or FLUSH_ALL */
+static spinlock_t tlbstate_lock = SPIN_LOCK_UNLOCKED; /* held by flush_tlb_others() for a whole request */
+#define FLUSH_ALL 0xffffffff /* flush_va value meaning "flush the whole TLB" */
+
+/*
+ * Detach 'cpu' from the mm recorded in its cpu_tlbstate.
+ *
+ * mmdrop() cannot be called here because we are in interrupt context,
+ * so the CPU is only removed from mm->cpu_vm_mask. %cr3 is then
+ * reloaded with swapper_pg_dir, since the page tables may be going
+ * away from under us.
+ *
+ * It is a BUG to call this while the CPU's tlb state is TLBSTATE_OK.
+ */
+static inline void leave_mm (unsigned long cpu)
+{
+	struct tlb_state *ts = &per_cpu(cpu_tlbstate, cpu);
+
+	BUG_ON(ts->state == TLBSTATE_OK);
+
+	cpu_clear(cpu, ts->active_mm->cpu_vm_mask);
+	load_cr3(swapper_pg_dir);
+}
+
+/*
+ *
+ * The flush IPI assumes that a thread switch happens in this order:
+ * [cpu0: the cpu that switches]
+ * 1) switch_mm() either 1a) or 1b)
+ * 1a) thread switch to a different mm
+ * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask);
+ * Stop ipi delivery for the old mm. This is not synchronized with
+ * the other cpus, but smp_invalidate_interrupt ignores flush ipis
+ * for the wrong mm, and in the worst case we perform a superfluous
+ * tlb flush.
+ * 1a2) set cpu_tlbstate to TLBSTATE_OK
+ * Now the smp_invalidate_interrupt won't call leave_mm if cpu0
+ * was in lazy tlb mode.
+ * 1a3) update cpu_tlbstate[].active_mm
+ * Now cpu0 accepts tlb flushes for the new mm.
+ * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask);
+ * Now the other cpus will send tlb flush ipis.
+ * 1a5) change cr3.
+ * 1b) thread switch without mm change
+ * cpu_tlbstate[].active_mm is correct, cpu0 already handles
+ * flush ipis.
+ * 1b1) set cpu_tlbstate to TLBSTATE_OK
+ * 1b2) test_and_set the cpu bit in cpu_vm_mask.
+ * Atomically set the bit [other cpus will start sending flush ipis],
+ * and test the bit.
+ * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
+ * 2) switch %%esp, ie current
+ *
+ * The interrupt must handle 2 special cases:
+ * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm.
+ * - the cpu performs speculative tlb reads, i.e. even if the cpu only
+ * runs in kernel space, the cpu could load tlb entries for user space
+ * pages.
+ *
+ * The good news is that cpu_tlbstate is local to each cpu, no
+ * write/read ordering problems.
+ */
+
+/*
+ * TLB flush IPI:
+ *
+ * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
+ * 2) Leave the mm if we are in the lazy tlb mode.
+ *
+ * A CPU that is not in flush_cpumask does nothing. A CPU that is in the
+ * mask finishes by clearing its own bit, with memory barriers on both
+ * sides of the clear; flush_tlb_others() spins until the mask is empty.
+ * Always returns IRQ_HANDLED.
+ */
+
+irqreturn_t smp_invalidate_interrupt(int irq, void *dev_id,
+ struct pt_regs *regs)
+{
+ unsigned long cpu;
+
+ cpu = get_cpu();
+
+ if (!cpu_isset(cpu, flush_cpumask))
+ goto out;
+ /*
+ * This was a BUG() but until someone can quote me the
+ * line from the intel manual that guarantees an IPI to
+ * multiple CPUs is retried _only_ on the erroring CPUs
+ * its staying as a return
+ *
+ * BUG();
+ */
+
+ if (flush_mm == per_cpu(cpu_tlbstate, cpu).active_mm) {
+ if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) {
+ if (flush_va == FLUSH_ALL)
+ local_flush_tlb();
+ else
+ __flush_tlb_one(flush_va);
+ } else
+ leave_mm(cpu);
+ }
+ smp_mb__before_clear_bit();
+ cpu_clear(cpu, flush_cpumask);
+ smp_mb__after_clear_bit();
+out:
+ put_cpu_no_resched();
+
+ return IRQ_HANDLED;
+}
+
+/*
+ * Ask every CPU in 'cpumask' to flush 'va' (or everything, when va is
+ * FLUSH_ALL) for 'mm', and busy-wait until all of them have done so.
+ *
+ * The request is published in flush_mm/flush_va/flush_cpumask under
+ * tlbstate_lock, an INVALIDATE_TLB_VECTOR IPI is sent to the mask, and
+ * the function spins until flush_cpumask is empty again.
+ *
+ * 'cpumask' must be non-empty, contain only online CPUs and not contain
+ * the calling CPU; 'mm' must be non-NULL. Violations hit BUG_ON().
+ */
+static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
+ unsigned long va)
+{
+ cpumask_t tmp;
+ /*
+ * A couple of (to be removed) sanity checks:
+ *
+ * - we do not send IPIs to not-yet booted CPUs.
+ * - current CPU must not be in mask
+ * - mask must exist :)
+ */
+ BUG_ON(cpus_empty(cpumask));
+
+ cpus_and(tmp, cpumask, cpu_online_map);
+ BUG_ON(!cpus_equal(cpumask, tmp));
+ BUG_ON(cpu_isset(smp_processor_id(), cpumask));
+ BUG_ON(!mm);
+
+ /*
+ * i'm not happy about this global shared spinlock in the
+ * MM hot path, but we'll see how contended it is.
+ * Temporarily this turns IRQs off, so that lockups are
+ * detected by the NMI watchdog.
+ *
+ * NOTE(review): the lock is taken with plain spin_lock(), not an
+ * irq-disabling variant, so the sentence about IRQs being turned
+ * off may be stale - confirm.
+ */
+ spin_lock(&tlbstate_lock);
+
+ flush_mm = mm;
+ flush_va = va;
+#if NR_CPUS <= BITS_PER_LONG
+ atomic_set_mask(cpumask, &flush_cpumask);
+#else
+ {
+ int k;
+ unsigned long *flush_mask = (unsigned long *)&flush_cpumask;
+ unsigned long *cpu_mask = (unsigned long *)&cpumask;
+ for (k = 0; k < BITS_TO_LONGS(NR_CPUS); ++k)
+ atomic_set_mask(cpu_mask[k], &flush_mask[k]);
+ }
+#endif
+ /*
+ * We have to send the IPI only to
+ * CPUs affected.
+ */
+ send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR);
+
+ while (!cpus_empty(flush_cpumask))
+ /* nothing. lockup detection does not belong here */
+ mb();
+
+ flush_mm = NULL;
+ flush_va = 0;
+ spin_unlock(&tlbstate_lock);
+}
+
+/*
+ * Flush the whole TLB for current->mm: locally first, then on every
+ * other CPU listed in the mm's cpu_vm_mask. Runs with preemption
+ * disabled.
+ */
+void flush_tlb_current_task(void)
+{
+	struct mm_struct *mm = current->mm;
+	cpumask_t others;
+
+	preempt_disable();
+
+	/* users of this mm, minus ourselves */
+	others = mm->cpu_vm_mask;
+	cpu_clear(smp_processor_id(), others);
+
+	local_flush_tlb();
+
+	if (!cpus_empty(others))
+		flush_tlb_others(others, mm, FLUSH_ALL);
+
+	preempt_enable();
+}
+
+/*
+ * Flush all TLB entries belonging to 'mm'.
+ *
+ * If the current task is running on 'mm', the local TLB is flushed when
+ * the task owns an mm of its own (current->mm set); otherwise the CPU
+ * leaves the mm via leave_mm(). Every other CPU in the mm's cpu_vm_mask
+ * is then asked to flush as well. Runs with preemption disabled.
+ */
+void flush_tlb_mm (struct mm_struct * mm)
+{
+	cpumask_t others;
+
+	preempt_disable();
+
+	/* users of this mm, minus ourselves */
+	others = mm->cpu_vm_mask;
+	cpu_clear(smp_processor_id(), others);
+
+	if (current->active_mm == mm) {
+		if (!current->mm)
+			leave_mm(smp_processor_id());
+		else
+			local_flush_tlb();
+	}
+
+	if (!cpus_empty(others))
+		flush_tlb_others(others, mm, FLUSH_ALL);
+
+	preempt_enable();
+}
+
+/*
+ * Flush the TLB entry for address 'va' in the mm that owns 'vma'.
+ *
+ * If the current task is running on that mm, the single entry is
+ * flushed locally when the task owns an mm of its own (current->mm
+ * set); otherwise the CPU leaves the mm via leave_mm(). Every other CPU
+ * in the mm's cpu_vm_mask is then asked to flush 'va' as well. Runs
+ * with preemption disabled.
+ */
+void flush_tlb_page(struct vm_area_struct * vma, unsigned long va)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	cpumask_t others;
+
+	preempt_disable();
+
+	/* users of this mm, minus ourselves */
+	others = mm->cpu_vm_mask;
+	cpu_clear(smp_processor_id(), others);
+
+	if (current->active_mm == mm) {
+		if (!current->mm)
+			leave_mm(smp_processor_id());
+		else
+			__flush_tlb_one(va);
+	}
+
+	if (!cpus_empty(others))
+		flush_tlb_others(others, mm, va);
+
+	preempt_enable();
+}
+
+/*
+ * Per-CPU worker for flush_tlb_all(): flush the entire local TLB and,
+ * if this CPU is in lazy tlb mode, leave the mm as well. 'info' is
+ * unused.
+ */
+static void do_flush_tlb_all(void* info)
+{
+	unsigned long this_cpu = smp_processor_id();
+
+	__flush_tlb_all();
+
+	if (per_cpu(cpu_tlbstate, this_cpu).state != TLBSTATE_LAZY)
+		return;
+
+	leave_mm(this_cpu);
+}
+
+/*
+ * Run do_flush_tlb_all() through on_each_cpu().
+ * NOTE(review): the two trailing 1 arguments are presumably the
+ * retry/wait flags of on_each_cpu() - confirm against its prototype.
+ */
+void flush_tlb_all(void)
+{
+ on_each_cpu(do_flush_tlb_all, NULL, 1, 1);
+}
+
+/*
+ * this function sends a 'reschedule' IPI to another CPU.
+ * it goes straight through and wastes no time serializing
+ * anything. Worst case is that we lose a reschedule ...
+ *
+ * The IPI is RESCHEDULE_VECTOR, sent with send_IPI_mask() to a mask
+ * containing only 'cpu'.
+ */
+void smp_send_reschedule(int cpu)
+{
+ send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
+}
+
+/*
+ * Structure and data for smp_call_function(). This is designed to minimise
+ * static memory requirements. It also looks cleaner.
+ *
+ * smp_call_function() builds a call_data_struct on its own stack and
+ * publishes it through 'call_data' while holding 'call_lock';
+ * smp_call_function_interrupt() reads it on the receiving CPUs.
+ */
+static spinlock_t call_lock = SPIN_LOCK_UNLOCKED;
+
+struct call_data_struct {
+ void (*func) (void *info); /* function each receiving CPU runs */
+ void *info; /* argument passed to func */
+ atomic_t started; /* incremented by each receiver before it runs func */
+ atomic_t finished; /* incremented by each receiver after func, only when wait is set */
+ int wait; /* non-zero: the initiator waits for 'finished' too */
+};
+
+static struct call_data_struct * call_data;
+
+/*
+ * this function sends a 'generic call function' IPI to all other CPUs
+ * in the system.
+ *
+ * The request lives in a call_data_struct on the caller's stack and is
+ * published through the global 'call_data' pointer under 'call_lock'.
+ * The caller then spins until every other online CPU has bumped
+ * 'started' and, when 'wait' is set, 'finished' as well.
+ */
+
+int smp_call_function (void (*func) (void *info), void *info, int nonatomic,
+ int wait)
+/*
+ * [SUMMARY] Run a function on all other CPUs.
+ * <func> The function to run. This must be fast and non-blocking.
+ * <info> An arbitrary pointer to pass to the function.
+ * <nonatomic> currently unused.
+ * <wait> If true, wait (atomically) until function has completed on other CPUs.
+ * [RETURNS] 0 on success, else a negative status code. Does not return until
+ * remote CPUs are nearly ready to execute <<func>> or are or have executed.
+ * (As written, every return path in this function returns 0.)
+ *
+ * You must not call this function with disabled interrupts or from a
+ * hardware interrupt handler or from a bottom half handler.
+ */
+{
+ struct call_data_struct data;
+ int cpus = num_online_cpus()-1;
+
+ if (!cpus)
+ return 0;
+
+ /* Can deadlock when called with interrupts disabled */
+ WARN_ON(irqs_disabled());
+
+ data.func = func;
+ data.info = info;
+ atomic_set(&data.started, 0);
+ data.wait = wait;
+ if (wait)
+ atomic_set(&data.finished, 0);
+
+ spin_lock(&call_lock);
+ call_data = &data;
+ mb();
+
+ /* Send a message to all other CPUs and wait for them to respond */
+ send_IPI_allbutself(CALL_FUNCTION_VECTOR);
+
+ /* Wait for response: every other CPU has picked up func/info */
+ while (atomic_read(&data.started) != cpus)
+ barrier();
+
+ if (wait)
+ while (atomic_read(&data.finished) != cpus)
+ barrier();
+ spin_unlock(&call_lock);
+
+ return 0;
+}
+
+/*
+ * Park the calling CPU for good: remove it from cpu_online_map, disable
+ * local interrupts and then loop forever, executing "hlt" when the CPU
+ * data says hlt works, or busy-spinning otherwise. Never returns.
+ * 'dummy' is unused.
+ */
+static void stop_this_cpu (void * dummy)
+{
+	/* Remove this CPU: */
+	cpu_clear(smp_processor_id(), cpu_online_map);
+	local_irq_disable();
+#if 1
+	/* only a debug message stands in for the local APIC shutdown here */
+	xxprint("stop_this_cpu disable_local_APIC\n");
+#else
+	disable_local_APIC();
+#endif
+	if (cpu_data[smp_processor_id()].hlt_works_ok) {
+		while (1)
+			__asm__("hlt");
+	}
+
+	while (1)
+		;
+}
+
+/*
+ * this function calls the 'stop' function on all other CPUs in the system.
+ *
+ * stop_this_cpu() is dispatched through smp_call_function() with wait=0,
+ * so this only waits for the other CPUs to pick the call up, not for it
+ * to complete. Afterwards, with local interrupts briefly disabled, a
+ * debug message is printed in place of the disabled disable_local_APIC()
+ * call.
+ */
+
+void smp_send_stop(void)
+{
+ smp_call_function(stop_this_cpu, NULL, 1, 0);
+
+ local_irq_disable();
+#if 1
+ xxprint("smp_send_stop disable_local_APIC\n");
+#else
+ disable_local_APIC();
+#endif
+ local_irq_enable();
+}
+
+/*
+ * Reschedule call back. Nothing to do,
+ * all the work is done automatically when
+ * we return from the interrupt.
+ *
+ * All three arguments are ignored; always returns IRQ_HANDLED.
+ */
+irqreturn_t smp_reschedule_interrupt(int irq, void *dev_id,
+ struct pt_regs *regs)
+{
+
+ return IRQ_HANDLED;
+}
+
+#include <linux/kallsyms.h>
+/*
+ * Receiving side of smp_call_function(): copy func/info/wait out of the
+ * published call_data, acknowledge by incrementing 'started', run the
+ * function between irq_enter() and irq_exit(), and, if the initiator
+ * asked to wait, increment 'finished'. Always returns IRQ_HANDLED.
+ */
+irqreturn_t smp_call_function_interrupt(int irq, void *dev_id,
+ struct pt_regs *regs)
+{
+ void (*func) (void *info) = call_data->func;
+ void *info = call_data->info;
+ int wait = call_data->wait;
+
+ /*
+ * Notify initiating CPU that I've grabbed the data and am
+ * about to execute the function
+ */
+ mb();
+ atomic_inc(&call_data->started);
+ /*
+ * At this point the info structure may be out of scope unless wait==1
+ * (it lives on the initiator's stack), which is why func, info and
+ * wait were copied into locals above.
+ */
+ irq_enter();
+ (*func)(info);
+ irq_exit();
+
+ if (wait) {
+ mb();
+ atomic_inc(&call_data->finished);
+ }
+
+ return IRQ_HANDLED;
+}
+
--- /dev/null
+/*
+ * x86 SMP booting functions
+ *
+ * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
+ * (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com>
+ *
+ * Much of the core SMP work is based on previous work by Thomas Radke, to
+ * whom a great many thanks are extended.
+ *
+ * Thanks to Intel for making available several different Pentium,
+ * Pentium Pro and Pentium-II/Xeon MP machines.
+ * Original development of Linux SMP code supported by Caldera.
+ *
+ * This code is released under the GNU General Public License version 2 or
+ * later.
+ *
+ * Fixes
+ * Felix Koop : NR_CPUS used properly
+ * Jose Renau : Handle single CPU case.
+ * Alan Cox : By repeated request 8) - Total BogoMIPS report.
+ * Greg Wright : Fix for kernel stacks panic.
+ * Erich Boleyn : MP v1.4 and additional changes.
+ * Matthias Sattler : Changes for 2.1 kernel map.
+ * Michel Lespinasse : Changes for 2.1 kernel map.
+ * Michael Chastain : Change trampoline.S to gnu as.
+ * Alan Cox : Dumb bug: 'B' step PPro's are fine
+ * Ingo Molnar : Added APIC timers, based on code
+ * from Jose Renau
+ * Ingo Molnar : various cleanups and rewrites
+ * Tigran Aivazian : fixed "0.00 in /proc/uptime on SMP" bug.
+ * Maciej W. Rozycki : Bits for genuine 82489DX APICs
+ * Martin J. Bligh : Added support for multi-quad systems
+ * Dave Jones : Report invalid combinations of Athlon CPUs.
+* Rusty Russell : Hacked into shape for new "hotplug" boot process. */
+
+#include <linux/module.h>
+#include <linux/config.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/kernel_stat.h>
+#include <linux/smp_lock.h>
+#include <linux/irq.h>
+#include <linux/bootmem.h>
+
+#include <linux/delay.h>
+#include <linux/mc146818rtc.h>
+#include <asm/tlbflush.h>
+#include <asm/desc.h>
+#include <asm/arch_hooks.h>
+
+#if 1
+#define Dprintk(args...)
+#else
+#include <mach_apic.h>
+#endif
+#include <mach_wakecpu.h>
+#include <smpboot_hooks.h>
+
+/* Set if we find a B stepping CPU */
+static int __initdata smp_b_stepping;
+
+/* Number of siblings per CPU package */
+int smp_num_siblings = 1;
+int phys_proc_id[NR_CPUS]; /* Package ID of each logical CPU */
+
+/* bitmap of online cpus */
+cpumask_t cpu_online_map;
+
+static cpumask_t cpu_callin_map;
+cpumask_t cpu_callout_map;
+static cpumask_t smp_commenced_mask;
+
+/* Per CPU bogomips and other parameters */
+struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned;
+
+u8 x86_cpu_to_apicid[NR_CPUS] =
+ { [0 ... NR_CPUS-1] = 0xff };
+EXPORT_SYMBOL(x86_cpu_to_apicid);
+
+/* Set when the idlers are all forked */
+int smp_threads_ready;
+
+#if 0
+/*
+ * Trampoline 80x86 program as an array.
+ */
+
+extern unsigned char trampoline_data [];
+extern unsigned char trampoline_end [];
+static unsigned char *trampoline_base;
+static int trampoline_exec;
+
+/*
+ * Currently trivial. Write the real->protected mode
+ * bootstrap into the page concerned. The caller
+ * has made sure it's suitably aligned.
+ *
+ * Copies the bytes between trampoline_data and trampoline_end into
+ * trampoline_base and returns the physical address of that page
+ * (via virt_to_phys()).
+ */
+
+static unsigned long __init setup_trampoline(void)
+{
+ memcpy(trampoline_base, trampoline_data, trampoline_end - trampoline_data);
+ return virt_to_phys(trampoline_base);
+}
+#endif
+
+/*
+ * We are called very early to get the low memory that the
+ * secondary CPUs need during bring-up.
+ *
+ * Active (#if 1) branch: allocate one low bootmem page for every
+ * CPU slot from 1 to NR_CPUS-1 and record it as the address of that
+ * CPU's GDT in cpu_gdt_descr[]. Slot 0 is not touched here.
+ *
+ * Disabled (#else) branch: the allocation of the real-mode SMP
+ * bootup trampoline page.
+ */
+void __init smp_alloc_memory(void)
+{
+#if 1
+ int cpu;
+
+ for (cpu = 1; cpu < NR_CPUS; cpu++) {
+ cpu_gdt_descr[cpu].address = (unsigned long)
+ alloc_bootmem_low_pages(PAGE_SIZE);
+ /* XXX free unused pages later */
+ }
+#else
+ trampoline_base = (void *) alloc_bootmem_low_pages(PAGE_SIZE);
+ /*
+ * Has to be in very low memory so we can execute
+ * real-mode AP code.
+ */
+ if (__pa(trampoline_base) >= 0x9F000)
+ BUG();
+ /*
+ * Make the SMP trampoline executable:
+ */
+ trampoline_exec = set_kernel_exec((unsigned long)trampoline_base, 1);
+#endif
+}
+
+/*
+ * Record the processor parameters of logical CPU 'id' in cpu_data[].
+ *
+ * The slot is seeded from boot_cpu_data (set up by the bootstrap kernel
+ * entry code); for every CPU other than 0, identify_cpu() is then run
+ * on it.  Two SMP sanity checks follow: B-step Pentia are remembered in
+ * smp_b_stepping, and AMD family 6 parts that are not known to be MP
+ * capable taint the kernel with TAINT_UNSAFE_SMP.
+ */
+
+static void __init smp_store_cpu_info(int id)
+{
+	struct cpuinfo_x86 *info = &cpu_data[id];
+	int listed_as_valid, relies_on_mp_bit;
+
+	*info = boot_cpu_data;
+	if (id != 0)
+		identify_cpu(info);
+
+	/*
+	 * Mask B, Pentium, but not Pentium MMX:
+	 * remember we have B step Pentia with bugs.
+	 */
+	if (info->x86_vendor == X86_VENDOR_INTEL && info->x86 == 5 &&
+	    info->x86_mask >= 1 && info->x86_mask <= 4 &&
+	    info->x86_model <= 3)
+		smp_b_stepping = 1;
+
+	/*
+	 * Certain Athlons might work (for various values of 'work') in SMP
+	 * but they are not certified as MP capable.  Nothing more to check
+	 * for anything that is not an AMD family 6 part.
+	 */
+	if (info->x86_vendor != X86_VENDOR_AMD || info->x86 != 6)
+		return;
+
+	/* Athlon 660/661 and Duron 670 are valid. */
+	listed_as_valid =
+		(info->x86_model == 6 &&
+		 (info->x86_mask == 0 || info->x86_mask == 1)) ||
+		(info->x86_model == 7 && info->x86_mask == 0);
+	if (listed_as_valid)
+		return;
+
+	/*
+	 * Athlon 662, Duron 671, and Athlon >model 7 have capability bit.
+	 * It's worth noting that the A5 stepping (662) of some Athlon XP's
+	 * have the MP bit set.
+	 * See http://www.heise.de/newsticker/data/jow-18.10.01-000 for more.
+	 */
+	relies_on_mp_bit =
+		(info->x86_model == 6 && info->x86_mask >= 2) ||
+		(info->x86_model == 7 && info->x86_mask >= 1) ||
+		info->x86_model > 7;
+	if (relies_on_mp_bit && cpu_has_mp)
+		return;
+
+	/* If we get here, it's not a certified SMP capable AMD system. */
+	tainted |= TAINT_UNSAFE_SMP;
+}
+
+#if 0
+/*
+ * TSC synchronization.
+ *
+ * We first check whether all CPUs have their TSC's synchronized,
+ * then we print a warning if not, and always resync.
+ */
+
+static atomic_t tsc_start_flag = ATOMIC_INIT(0);
+static atomic_t tsc_count_start = ATOMIC_INIT(0);
+static atomic_t tsc_count_stop = ATOMIC_INIT(0);
+static unsigned long long tsc_values[NR_CPUS];
+
+#define NR_LOOPS 5
+
+/*
+ * Boot-processor side of the TSC synchronization (this code sits in a
+ * disabled "#if 0" region).  Runs NR_LOOPS lock-step rounds with the
+ * APs using tsc_count_start/tsc_count_stop as barriers, samples the TSC
+ * into tsc_values[] in each round and zeroes it in the last one.  It
+ * then reports every CPU whose sampled value differs from the average
+ * by more than 2 microseconds, or prints "passed." if none does.
+ */
+static void __init synchronize_tsc_bp (void)
+{
+ int i;
+ unsigned long long t0;
+ unsigned long long sum, avg;
+ long long delta;
+ unsigned long one_usec;
+ int buggy = 0;
+
+ printk(KERN_INFO "checking TSC synchronization across %u CPUs: ", num_booting_cpus());
+
+ /* convert from kcyc/sec to cyc/usec */
+ one_usec = cpu_khz / 1000;
+
+ atomic_set(&tsc_start_flag, 1);
+ wmb();
+
+ /*
+ * We loop a few times to get a primed instruction cache,
+ * then the last pass is more or less synchronized and
+ * the BP and APs set their cycle counters to zero all at
+ * once. This reduces the chance of having random offsets
+ * between the processors, and guarantees that the maximum
+ * delay between the cycle counters is never bigger than
+ * the latency of information-passing (cachelines) between
+ * two CPUs.
+ */
+ for (i = 0; i < NR_LOOPS; i++) {
+ /*
+ * all APs synchronize but they loop on '== num_cpus'
+ */
+ while (atomic_read(&tsc_count_start) != num_booting_cpus()-1)
+ mb();
+ atomic_set(&tsc_count_stop, 0);
+ wmb();
+ /*
+ * this lets the APs save their current TSC:
+ */
+ atomic_inc(&tsc_count_start);
+
+ rdtscll(tsc_values[smp_processor_id()]);
+ /*
+ * We clear the TSC in the last loop:
+ */
+ if (i == NR_LOOPS-1)
+ write_tsc(0, 0);
+
+ /*
+ * Wait for all APs to leave the synchronization point:
+ */
+ while (atomic_read(&tsc_count_stop) != num_booting_cpus()-1)
+ mb();
+ atomic_set(&tsc_count_start, 0);
+ wmb();
+ atomic_inc(&tsc_count_stop);
+ }
+
+ sum = 0;
+ for (i = 0; i < NR_CPUS; i++) {
+ if (cpu_isset(i, cpu_callout_map)) {
+ t0 = tsc_values[i];
+ sum += t0;
+ }
+ }
+ avg = sum;
+ do_div(avg, num_booting_cpus());
+
+ sum = 0;
+ for (i = 0; i < NR_CPUS; i++) {
+ if (!cpu_isset(i, cpu_callout_map))
+ continue;
+ delta = tsc_values[i] - avg;
+ if (delta < 0)
+ delta = -delta;
+ /*
+ * We report bigger than 2 microseconds clock differences.
+ */
+ if (delta > 2*one_usec) {
+ long realdelta;
+ if (!buggy) {
+ buggy = 1;
+ printk("\n");
+ }
+ realdelta = delta;
+ do_div(realdelta, one_usec);
+ if (tsc_values[i] < avg)
+ realdelta = -realdelta;
+
+ printk(KERN_INFO "CPU#%d had %ld usecs TSC skew, fixed it up.\n", i, realdelta);
+ }
+
+ sum += delta; /* accumulated but not read again in this function */
+ }
+ if (!buggy)
+ printk("passed.\n");
+}
+
+/*
+ * Application-processor side of the TSC synchronization (this code sits
+ * in a disabled "#if 0" region).  Waits for tsc_start_flag, then takes
+ * part in NR_LOOPS barrier rounds: each round samples this CPU's TSC
+ * into tsc_values[], and the last round also zeroes the TSC.
+ */
+static void __init synchronize_tsc_ap (void)
+{
+ int i;
+
+ /*
+ * Not every cpu is online at the time
+ * this gets called, so we first wait for the BP to
+ * finish SMP initialization:
+ */
+ while (!atomic_read(&tsc_start_flag)) mb();
+
+ for (i = 0; i < NR_LOOPS; i++) {
+ atomic_inc(&tsc_count_start);
+ while (atomic_read(&tsc_count_start) != num_booting_cpus())
+ mb();
+
+ rdtscll(tsc_values[smp_processor_id()]);
+ if (i == NR_LOOPS-1)
+ write_tsc(0, 0);
+
+ atomic_inc(&tsc_count_stop);
+ while (atomic_read(&tsc_count_stop) != num_booting_cpus()) mb();
+ }
+}
+#undef NR_LOOPS
+#endif
+
+extern void calibrate_delay(void);
+
+static atomic_t init_deasserted;
+
+/*
+ * Runs on a freshly started secondary CPU (called from start_secondary()).
+ * Waits up to 2s for the boot CPU to set our bit in cpu_callout_map,
+ * calibrates the delay loop, stores this CPU's parameters in cpu_data[]
+ * and finally sets our bit in cpu_callin_map so the boot CPU can go on.
+ * BUG()s if this CPU already called in or if no callout arrives in time.
+ */
+void __init smp_callin(void)
+{
+ int cpuid, phys_id;
+ unsigned long timeout;
+
+#if 0
+ /*
+ * If waken up by an INIT in an 82489DX configuration
+ * we may get here before an INIT-deassert IPI reaches
+ * our local APIC. We have to wait for the IPI or we'll
+ * lock up on an APIC access.
+ */
+ wait_for_init_deassert(&init_deasserted);
+#endif
+
+ /*
+ * (This works even if the APIC is not enabled.)
+ * Both IDs are taken from smp_processor_id() here.
+ */
+ phys_id = smp_processor_id();
+ cpuid = smp_processor_id();
+ if (cpu_isset(cpuid, cpu_callin_map)) {
+ printk("huh, phys CPU#%d, CPU#%d already present??\n",
+ phys_id, cpuid);
+ BUG();
+ }
+ Dprintk("CPU#%d (phys ID: %d) waiting for CALLOUT\n", cpuid, phys_id);
+
+ /*
+ * STARTUP IPIs are fragile beasts as they might sometimes
+ * trigger some glue motherboard logic. Complete APIC bus
+ * silence for 1 second, this overestimates the time the
+ * boot CPU is spending to send the up to 2 STARTUP IPIs
+ * by a factor of two. This should be enough.
+ */
+
+ /*
+ * Waiting 2s total for startup (udelay is not yet working)
+ */
+ timeout = jiffies + 2*HZ;
+ while (time_before(jiffies, timeout)) {
+ /*
+ * Has the boot CPU finished its STARTUP sequence?
+ */
+ if (cpu_isset(cpuid, cpu_callout_map))
+ break;
+ rep_nop();
+ }
+
+ if (!time_before(jiffies, timeout)) {
+ printk("BUG: CPU%d started up but did not get a callout!\n",
+ cpuid);
+ BUG();
+ }
+
+#if 0
+ /*
+ * the boot CPU has finished the init stage and is spinning
+ * on callin_map until we finish. We are free to set up this
+ * CPU, first the APIC. (this is probably redundant on most
+ * boards)
+ */
+
+ Dprintk("CALLIN, before setup_local_APIC().\n");
+ smp_callin_clear_local_apic();
+ setup_local_APIC();
+#endif
+ map_cpu_to_logical_apicid();
+
+ local_irq_enable();
+
+ /*
+ * Get our bogomips.
+ */
+ calibrate_delay();
+ Dprintk("Stack at about %p\n",&cpuid);
+
+ /*
+ * Save our processor parameters
+ */
+ smp_store_cpu_info(cpuid);
+
+#if 0
+ disable_APIC_timer();
+#endif
+ local_irq_disable();
+ /*
+ * Allow the master to continue.
+ */
+ cpu_set(cpuid, cpu_callin_map);
+
+#if 0
+ /*
+ * Synchronize the TSC with the BP
+ */
+ if (cpu_has_tsc && cpu_khz)
+ synchronize_tsc_ap();
+#endif
+}
+
+int cpucount;
+
+extern int cpu_idle(void);
+
+
+/* Handler for the per-CPU debug VIRQ: does nothing and reports IRQ_HANDLED. */
+static irqreturn_t local_debug_interrupt(int irq, void *dev_id,
+ struct pt_regs *regs)
+{
+
+ return IRQ_HANDLED;
+}
+
+/* irqaction installed by local_setup_debug(); shows up under the name "ldebug". */
+static struct irqaction local_irq_debug = {
+ local_debug_interrupt, SA_INTERRUPT, CPU_MASK_NONE, "ldebug",
+ NULL, NULL
+};
+
+/*
+ * Bind VIRQ_DEBUG to an IRQ and install local_irq_debug on it.
+ * The result of setup_irq() is deliberately ignored.
+ */
+void local_setup_debug(void)
+{
+ (void)setup_irq(bind_virq_to_irq(VIRQ_DEBUG), &local_irq_debug);
+}
+
+
+extern void local_setup_timer(void);
+
+/*
+ * Activate a secondary processor.
+ *
+ * Entry point of a secondary CPU's idle task (do_boot_cpu() stores it in
+ * idle->thread.eip). After smp_callin() it spins until this CPU's bit
+ * is set in smp_commenced_mask (done by __cpu_up()), then calls
+ * local_setup_timer(), local_setup_debug() and smp_intr_init(), marks
+ * itself in cpu_online_map and enters cpu_idle().
+ * The 'unused' argument is ignored.
+ */
+int __init start_secondary(void *unused)
+{
+ /*
+ * Dont put anything before smp_callin(), SMP
+ * booting is too fragile that we want to limit the
+ * things done here to the most necessary things.
+ */
+ cpu_init();
+ smp_callin();
+ while (!cpu_isset(smp_processor_id(), smp_commenced_mask))
+ rep_nop();
+ local_setup_timer();
+ local_setup_debug(); /* XXX */
+ smp_intr_init();
+ local_irq_enable();
+ /*
+ * low-memory mappings have been cleared, flush them from
+ * the local TLBs too.
+ */
+ local_flush_tlb();
+ cpu_set(smp_processor_id(), cpu_online_map);
+ wmb();
+ if (0) { /* disabled debug aid; if enabled the loop below never exits, as timeout is reset to 0 at 2000 */
+ char *msg2 = "delay2\n";
+ int timeout;
+ for (timeout = 0; timeout < 50000; timeout++) {
+ udelay(1000);
+ if (timeout == 2000) {
+ (void)HYPERVISOR_console_io(CONSOLEIO_write, strlen(msg2), msg2);
+ timeout = 0;
+ }
+ }
+ }
+ return cpu_idle();
+}
+
+/*
+ * Everything has been set up for the secondary
+ * CPUs - they just need to reload everything
+ * from the task structure
+ * This function must not return.
+ */
+void __init initialize_secondary(void)
+{
+ /*
+ * We don't actually need to load the full TSS,
+ * basically just the stack pointer and the eip.
+ *
+ * The asm loads %esp from current->thread.esp and then jumps
+ * to current->thread.eip, so control never comes back here.
+ */
+
+ asm volatile(
+ "movl %0,%%esp\n\t"
+ "jmp *%1"
+ :
+ :"r" (current->thread.esp),"r" (current->thread.eip));
+}
+
+extern struct {
+ void * esp;
+ unsigned short ss;
+} stack_start;
+
+#ifdef CONFIG_NUMA
+
+/* which logical CPUs are on which nodes */
+cpumask_t node_2_cpu_mask[MAX_NUMNODES] =
+ { [0 ... MAX_NUMNODES-1] = CPU_MASK_NONE };
+/* which node each logical CPU is on */
+int cpu_2_node[NR_CPUS] = { [0 ... NR_CPUS-1] = 0 };
+EXPORT_SYMBOL(cpu_2_node);
+
+/* Record that logical CPU 'cpu' belongs to NUMA node 'node' (logged via printk). */
+static inline void map_cpu_to_node(int cpu, int node)
+{
+	printk("Mapping cpu %d to node %d\n", cpu, node);
+	cpu_2_node[cpu] = node;
+	cpu_set(cpu, node_2_cpu_mask[node]);
+}
+
+/* Remove 'cpu' from every node's CPU mask and point it back at node 0. */
+static inline void unmap_cpu_to_node(int cpu)
+{
+	int nid = 0;
+
+	printk("Unmapping cpu %d from all nodes\n", cpu);
+	while (nid < MAX_NUMNODES) {
+		cpu_clear(cpu, node_2_cpu_mask[nid]);
+		nid++;
+	}
+	cpu_2_node[cpu] = 0;
+}
+#else /* !CONFIG_NUMA */
+
+#define map_cpu_to_node(cpu, node) ({})
+#define unmap_cpu_to_node(cpu) ({})
+
+#endif /* CONFIG_NUMA */
+
+u8 cpu_2_logical_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
+
+/*
+ * Record the calling CPU in cpu_2_logical_apicid[] and map it to its
+ * node.  Here the "apicid" is simply smp_processor_id(), the same value
+ * as the CPU number.
+ */
+void map_cpu_to_logical_apicid(void)
+{
+ int cpu = smp_processor_id();
+ int apicid = smp_processor_id();
+
+ cpu_2_logical_apicid[cpu] = apicid;
+ map_cpu_to_node(cpu, apicid_to_node(apicid));
+}
+
+/* Undo map_cpu_to_logical_apicid() for 'cpu': reset its entry to BAD_APICID and unmap it from its node. */
+void unmap_cpu_to_logical_apicid(int cpu)
+{
+ cpu_2_logical_apicid[cpu] = BAD_APICID;
+ unmap_cpu_to_node(cpu);
+}
+
+#if APIC_DEBUG
+/*
+ * Debug helper (only built when APIC_DEBUG is set): issue remote-read
+ * requests for the ID, VERSION and SPIV registers of APIC 'apicid' and
+ * print each value, or "failed" when no valid answer arrives.
+ */
+static inline void __inquire_remote_apic(int apicid)
+{
+ int i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 };
+ char *names[] = { "ID", "VERSION", "SPIV" };
+ int timeout, status;
+
+ printk("Inquiring remote APIC #%d...\n", apicid);
+
+ for (i = 0; i < sizeof(regs) / sizeof(*regs); i++) {
+ printk("... APIC #%d %s: ", apicid, names[i]);
+
+ /*
+ * Wait for idle.
+ */
+ apic_wait_icr_idle();
+
+ apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(apicid));
+ apic_write_around(APIC_ICR, APIC_DM_REMRD | regs[i]);
+
+ /* Poll in 100us steps while the remote read is still in progress. */
+ timeout = 0;
+ do {
+ udelay(100);
+ status = apic_read(APIC_ICR) & APIC_ICR_RR_MASK;
+ } while (status == APIC_ICR_RR_INPROG && timeout++ < 1000);
+
+ switch (status) {
+ case APIC_ICR_RR_VALID:
+ status = apic_read(APIC_RRR);
+ printk("%08x\n", status);
+ break;
+ default:
+ printk("failed\n");
+ }
+ }
+}
+#endif
+
+#if 0
+#ifdef WAKE_SECONDARY_VIA_NMI
+/*
+ * Poke the other CPU in the eye via NMI to wake it up. Remember that the normal
+ * INIT, INIT, STARTUP sequence will reset the chip hard for us, and this
+ * won't ... remember to clear down the APIC, etc later.
+ *
+ * Returns send_status | accept_status, i.e. zero when the NMI was sent
+ * and no APIC error was recorded.  start_eip is not used by this variant.
+ */
+static int __init
+wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip)
+{
+ unsigned long send_status = 0, accept_status = 0;
+ int timeout, maxlvt;
+
+ /* Target chip */
+ apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(logical_apicid));
+
+ /* Boot on the stack */
+ /* Kick the second */
+ apic_write_around(APIC_ICR, APIC_DM_NMI | APIC_DEST_LOGICAL);
+
+ Dprintk("Waiting for send to finish...\n");
+ timeout = 0;
+ do {
+ Dprintk("+");
+ udelay(100);
+ send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
+ } while (send_status && (timeout++ < 1000));
+
+ /*
+ * Give the other CPU some time to accept the IPI.
+ */
+ udelay(200);
+ /*
+ * Due to the Pentium erratum 3AP.
+ */
+ maxlvt = get_maxlvt();
+ if (maxlvt > 3) {
+ apic_read_around(APIC_SPIV);
+ apic_write(APIC_ESR, 0);
+ }
+ accept_status = (apic_read(APIC_ESR) & 0xEF);
+ Dprintk("NMI sent.\n");
+
+ if (send_status)
+ printk("APIC never delivered???\n");
+ if (accept_status)
+ printk("APIC delivery error (%lx).\n", accept_status);
+
+ return (send_status | accept_status);
+}
+#endif /* WAKE_SECONDARY_VIA_NMI */
+
+#ifdef WAKE_SECONDARY_VIA_INIT
+/*
+ * Wake the CPU with APIC ID 'phys_apicid' using an INIT assert/deassert
+ * pair followed, for integrated APICs, by up to two STARTUP IPIs that
+ * carry start_eip >> 12.  Returns send_status | accept_status, i.e. zero
+ * when everything was sent and no APIC error was recorded.
+ */
+static int __init
+wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
+{
+ unsigned long send_status = 0, accept_status = 0;
+ int maxlvt, timeout, num_starts, j;
+
+ /*
+ * Be paranoid about clearing APIC errors.
+ */
+ if (APIC_INTEGRATED(apic_version[phys_apicid])) {
+ apic_read_around(APIC_SPIV);
+ apic_write(APIC_ESR, 0);
+ apic_read(APIC_ESR);
+ }
+
+ Dprintk("Asserting INIT.\n");
+
+ /*
+ * Turn INIT on target chip
+ */
+ apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
+
+ /*
+ * Send IPI
+ */
+ apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_INT_ASSERT
+ | APIC_DM_INIT);
+
+ Dprintk("Waiting for send to finish...\n");
+ timeout = 0;
+ do {
+ Dprintk("+");
+ udelay(100);
+ send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
+ } while (send_status && (timeout++ < 1000));
+
+ mdelay(10);
+
+ Dprintk("Deasserting INIT.\n");
+
+ /* Target chip */
+ apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
+
+ /* Send IPI */
+ apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT);
+
+ Dprintk("Waiting for send to finish...\n");
+ timeout = 0;
+ do {
+ Dprintk("+");
+ udelay(100);
+ send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
+ } while (send_status && (timeout++ < 1000));
+
+ atomic_set(&init_deasserted, 1);
+
+ /*
+ * Should we send STARTUP IPIs ?
+ *
+ * Determine this based on the APIC version.
+ * If we don't have an integrated APIC, don't send the STARTUP IPIs.
+ */
+ if (APIC_INTEGRATED(apic_version[phys_apicid]))
+ num_starts = 2;
+ else
+ num_starts = 0;
+
+ /*
+ * Run STARTUP IPI loop.
+ */
+ Dprintk("#startup loops: %d.\n", num_starts);
+
+ maxlvt = get_maxlvt();
+
+ for (j = 1; j <= num_starts; j++) {
+ Dprintk("Sending STARTUP #%d.\n",j);
+ apic_read_around(APIC_SPIV);
+ apic_write(APIC_ESR, 0);
+ apic_read(APIC_ESR);
+ Dprintk("After apic_write.\n");
+
+ /*
+ * STARTUP IPI
+ */
+
+ /* Target chip */
+ apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
+
+ /* Boot on the stack */
+ /* Kick the second */
+ apic_write_around(APIC_ICR, APIC_DM_STARTUP
+ | (start_eip >> 12));
+
+ /*
+ * Give the other CPU some time to accept the IPI.
+ */
+ udelay(300);
+
+ Dprintk("Startup point 1.\n");
+
+ Dprintk("Waiting for send to finish...\n");
+ timeout = 0;
+ do {
+ Dprintk("+");
+ udelay(100);
+ send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
+ } while (send_status && (timeout++ < 1000));
+
+ /*
+ * Give the other CPU some time to accept the IPI.
+ */
+ udelay(200);
+ /*
+ * Due to the Pentium erratum 3AP.
+ */
+ if (maxlvt > 3) {
+ apic_read_around(APIC_SPIV);
+ apic_write(APIC_ESR, 0);
+ }
+ accept_status = (apic_read(APIC_ESR) & 0xEF);
+ if (send_status || accept_status)
+ break;
+ }
+ Dprintk("After Startup.\n");
+
+ if (send_status)
+ printk("APIC never delivered???\n");
+ if (accept_status)
+ printk("APIC delivery error (%lx).\n", accept_status);
+
+ return (send_status | accept_status);
+}
+#endif /* WAKE_SECONDARY_VIA_INIT */
+#endif
+
+extern cpumask_t cpu_initialized;
+
+static int __init do_boot_cpu(int apicid)
+/*
+ * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
+ * (ie clustered apic addressing mode), this is a LOGICAL apic ID.
+ * Returns zero if CPU booted OK, else error code from wakeup_secondary_cpu.
+ *
+ * In the active (#if 1) branch no IPIs are sent: the new CPU's GDT is
+ * cloned from CPU 0's (with the reserved entries cleared), a
+ * full_execution_context_t is filled in and handed to
+ * HYPERVISOR_boot_vcpu(), and we then wait up to 5s for the CPU to set
+ * its bit in cpu_callin_map.  In that branch the return value is the
+ * hypercall's result, or 1 if the CPU never called in.
+ */
+{
+ struct task_struct *idle;
+ unsigned long boot_error;
+ int timeout, cpu;
+ unsigned long start_eip;
+#if 0
+ unsigned short nmi_high = 0, nmi_low = 0;
+#endif
+ full_execution_context_t ctxt;
+ extern void startup_32_smp(void);
+ extern void hypervisor_callback(void);
+ extern void failsafe_callback(void);
+ extern int smp_trap_init(trap_info_t *);
+ int i;
+
+ /* The logical CPU number comes from cpucount, not from 'apicid'. */
+ cpu = ++cpucount;
+ /*
+ * We can't use kernel_thread since we must avoid to
+ * reschedule the child.
+ */
+ idle = fork_idle(cpu);
+ if (IS_ERR(idle))
+ panic("failed fork for CPU %d", cpu);
+ idle->thread.eip = (unsigned long) start_secondary;
+ /* start_eip had better be page-aligned! */
+ start_eip = (unsigned long)startup_32_smp;
+
+ /* So we see what's up */
+ printk("Booting processor %d/%d eip %lx\n", cpu, apicid, start_eip);
+ /* Stack for startup_32 can be just as for start_secondary onwards */
+ stack_start.esp = (void *) idle->thread.esp;
+
+ irq_ctx_init(cpu);
+
+ /*
+ * This grunge runs the startup process for
+ * the targeted processor.
+ */
+
+ atomic_set(&init_deasserted, 0);
+
+#if 1
+ /* The per-CPU GDT buffer from smp_alloc_memory() is a single page. */
+ if (cpu_gdt_descr[0].size > PAGE_SIZE)
+ BUG();
+ cpu_gdt_descr[cpu].size = cpu_gdt_descr[0].size;
+ memcpy((void *)cpu_gdt_descr[cpu].address,
+ (void *)cpu_gdt_descr[0].address, cpu_gdt_descr[0].size);
+ memset((char *)cpu_gdt_descr[cpu].address +
+ FIRST_RESERVED_GDT_ENTRY * 8, 0,
+ NR_RESERVED_GDT_ENTRIES * 8);
+
+ memset(&ctxt, 0, sizeof(ctxt));
+
+ ctxt.cpu_ctxt.ds = __USER_DS;
+ ctxt.cpu_ctxt.es = __USER_DS;
+ ctxt.cpu_ctxt.fs = 0;
+ ctxt.cpu_ctxt.gs = 0;
+ ctxt.cpu_ctxt.ss = __KERNEL_DS;
+ ctxt.cpu_ctxt.cs = __KERNEL_CS;
+ ctxt.cpu_ctxt.eip = start_eip;
+ ctxt.cpu_ctxt.esp = idle->thread.esp;
+ /* EFLAGS: bit 9 is IF; io_pl is placed in the IOPL field (bits 12-13). */
+ ctxt.cpu_ctxt.eflags = (1<<9) | (1<<2) | (idle->thread.io_pl<<12);
+
+ /* FPU is set up to default initial state. */
+ memset(ctxt.fpu_ctxt, 0, sizeof(ctxt.fpu_ctxt));
+
+ /* Virtual IDT is empty at start-of-day. */
+ for ( i = 0; i < 256; i++ )
+ {
+ ctxt.trap_ctxt[i].vector = i;
+ ctxt.trap_ctxt[i].cs = FLAT_GUESTOS_CS;
+ }
+ ctxt.fast_trap_idx = smp_trap_init(ctxt.trap_ctxt);
+
+ /* No LDT. */
+ ctxt.ldt_ents = 0;
+
+ {
+ unsigned long va;
+ int f;
+
+ /* Pass the machine frame of each GDT page and make the page read-only. */
+ for (va = cpu_gdt_descr[cpu].address, f = 0;
+ va < cpu_gdt_descr[cpu].address + cpu_gdt_descr[cpu].size;
+ va += PAGE_SIZE, f++) {
+ ctxt.gdt_frames[f] = virt_to_machine(va) >> PAGE_SHIFT;
+ make_page_readonly((void *)va);
+ }
+ ctxt.gdt_ents = cpu_gdt_descr[cpu].size / 8;
+ flush_page_update_queue();
+ }
+
+ /* Ring 1 stack is the initial stack. */
+ ctxt.guestos_ss = __KERNEL_DS;
+ ctxt.guestos_esp = idle->thread.esp;
+
+ /* Callback handlers. */
+ ctxt.event_callback_cs = __KERNEL_CS;
+ ctxt.event_callback_eip = (unsigned long)hypervisor_callback;
+ ctxt.failsafe_callback_cs = __KERNEL_CS;
+ ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback;
+
+ ctxt.pt_base = (unsigned long)virt_to_machine(swapper_pg_dir);
+
+ boot_error = HYPERVISOR_boot_vcpu(cpu, &ctxt);
+
+ if (!boot_error) {
+ /*
+ * allow APs to start initializing.
+ */
+ Dprintk("Before Callout %d.\n", cpu);
+ cpu_set(cpu, cpu_callout_map);
+ Dprintk("After Callout %d.\n", cpu);
+
+ /*
+ * Wait 5s total for a response
+ */
+ for (timeout = 0; timeout < 50000; timeout++) {
+ if (cpu_isset(cpu, cpu_callin_map))
+ break; /* It has booted */
+ udelay(100);
+ }
+
+ if (cpu_isset(cpu, cpu_callin_map)) {
+ /* number CPUs logically, starting from 1 (BSP is 0) */
+ Dprintk("OK.\n");
+ printk("CPU%d: ", cpu);
+ print_cpu_info(&cpu_data[cpu]);
+ Dprintk("CPU has booted.\n");
+ } else {
+ boot_error= 1;
+ }
+ }
+ x86_cpu_to_apicid[cpu] = apicid;
+ if (boot_error) {
+ /* Try to put things back the way they were before ... */
+ unmap_cpu_to_logical_apicid(cpu);
+ cpu_clear(cpu, cpu_callout_map); /* was set here (do_boot_cpu()) */
+ cpu_clear(cpu, cpu_initialized); /* was set by cpu_init() */
+ cpucount--;
+ }
+
+#else
+ Dprintk("Setting warm reset code and vector.\n");
+
+ store_NMI_vector(&nmi_high, &nmi_low);
+
+ smpboot_setup_warm_reset_vector(start_eip);
+
+ /*
+ * Starting actual IPI sequence...
+ */
+ boot_error = wakeup_secondary_cpu(apicid, start_eip);
+
+ if (!boot_error) {
+ /*
+ * allow APs to start initializing.
+ */
+ Dprintk("Before Callout %d.\n", cpu);
+ cpu_set(cpu, cpu_callout_map);
+ Dprintk("After Callout %d.\n", cpu);
+
+ /*
+ * Wait 5s total for a response
+ */
+ for (timeout = 0; timeout < 50000; timeout++) {
+ if (cpu_isset(cpu, cpu_callin_map))
+ break; /* It has booted */
+ udelay(100);
+ }
+
+ if (cpu_isset(cpu, cpu_callin_map)) {
+ /* number CPUs logically, starting from 1 (BSP is 0) */
+ Dprintk("OK.\n");
+ printk("CPU%d: ", cpu);
+ print_cpu_info(&cpu_data[cpu]);
+ Dprintk("CPU has booted.\n");
+ } else {
+ boot_error= 1;
+ if (*((volatile unsigned char *)trampoline_base)
+ == 0xA5)
+ /* trampoline started but...? */
+ printk("Stuck ??\n");
+ else
+ /* trampoline code not run */
+ printk("Not responding.\n");
+ inquire_remote_apic(apicid);
+ }
+ }
+ x86_cpu_to_apicid[cpu] = apicid;
+ if (boot_error) {
+ /* Try to put things back the way they were before ... */
+ unmap_cpu_to_logical_apicid(cpu);
+ cpu_clear(cpu, cpu_callout_map); /* was set here (do_boot_cpu()) */
+ cpu_clear(cpu, cpu_initialized); /* was set by cpu_init() */
+ cpucount--;
+ }
+
+ /* mark "stuck" area as not stuck */
+ *((volatile unsigned long *)trampoline_base) = 0;
+#endif
+
+ return boot_error;
+}
+
+cycles_t cacheflush_time;
+unsigned long cache_decay_ticks;
+
+/*
+ * Derive the SMP scheduling parameters cacheflush_time and
+ * cache_decay_ticks from cpu_khz and the boot CPU's cache size, and
+ * print them.  Without a known cpu_khz, cacheflush_time is set to 0 and
+ * nothing else is touched.
+ */
+static void smp_tune_scheduling(void)
+{
+	unsigned long cache_kb;		/* kB */
+	unsigned long mem_bw = 350;	/* MB/s */
+
+	/*
+	 * Rough estimation for SMP scheduling, this is the number of
+	 * cycles it takes for a fully memory-limited process to flush
+	 * the SMP-local cache.
+	 *
+	 * (For a P5 this pretty much means we will choose another idle
+	 *  CPU almost always at wakeup time (this is due to the small
+	 *  L1 cache), on PIIs it's around 50-100 usecs, depending on
+	 *  the cache size)
+	 */
+
+	if (!cpu_khz) {
+		/*
+		 * this basically disables processor-affinity
+		 * scheduling on SMP without a TSC.
+		 */
+		cacheflush_time = 0;
+		return;
+	}
+
+	cache_kb = boot_cpu_data.x86_cache_size;
+	if (cache_kb == -1) {
+		/* Pentiums, 2x8kB cache */
+		cache_kb = 16;
+		mem_bw = 100;
+	}
+
+	cacheflush_time = (cpu_khz>>10) * (cache_kb<<10) / mem_bw;
+	cache_decay_ticks = (long)cacheflush_time/cpu_khz + 1;
+
+	printk("per-CPU timeslice cutoff: %ld.%02ld usecs.\n",
+	       (long)cacheflush_time/(cpu_khz/1000),
+	       ((long)cacheflush_time*100/(cpu_khz/1000)) % 100);
+	printk("task migration cache decay timeout: %ld msecs.\n",
+	       cache_decay_ticks);
+}
+
+/*
+ * Cycle through the processors sending APIC IPIs to boot each.
+ */
+
+#if 0
+static int boot_cpu_logical_apicid;
+#endif
+/* Where the IO area was mapped on multiquad, always 0 otherwise */
+void *xquad_portio;
+
+cpumask_t cpu_sibling_map[NR_CPUS] __cacheline_aligned;
+
+/*
+ * Bring up the secondary CPUs (called from smp_prepare_cpus()).
+ *
+ * Records the boot CPU's data, then calls do_boot_cpu() for CPUs
+ * 1 .. HYPERVISOR_shared_info->n_vcpu-1 until max_cpus processors are
+ * active.  Afterwards it prints the BogoMIPS total and builds
+ * cpu_sibling_map[].  Returns early, with only the boot CPU set up,
+ * when no SMP configuration was found or when max_cpus is 0.
+ */
+static void __init smp_boot_cpus(unsigned int max_cpus)
+{
+ int cpu, kicked;
+ unsigned long bogosum = 0;
+#if 0
+ int apicid, bit;
+#endif
+
+ /*
+ * Setup boot CPU information
+ */
+ smp_store_cpu_info(0); /* Final full version of the data */
+ printk("CPU%d: ", 0);
+ print_cpu_info(&cpu_data[0]);
+
+#if 0
+ boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID));
+ boot_cpu_logical_apicid = logical_smp_processor_id();
+ x86_cpu_to_apicid[0] = boot_cpu_physical_apicid;
+#else
+ // boot_cpu_physical_apicid = 0;
+ // boot_cpu_logical_apicid = 0;
+ x86_cpu_to_apicid[0] = 0;
+#endif
+
+ current_thread_info()->cpu = 0;
+ smp_tune_scheduling();
+ cpus_clear(cpu_sibling_map[0]);
+ cpu_set(0, cpu_sibling_map[0]);
+
+ /*
+ * If we couldn't find an SMP configuration at boot time,
+ * get out of here now!
+ */
+ if (!smp_found_config /* && !acpi_lapic) */) {
+ printk(KERN_NOTICE "SMP motherboard not detected.\n");
+ smpboot_clear_io_apic_irqs();
+#if 0
+ phys_cpu_present_map = physid_mask_of_physid(0);
+ if (APIC_init_uniprocessor())
+ printk(KERN_NOTICE "Local APIC not detected."
+ " Using dummy APIC emulation.\n");
+#endif
+ map_cpu_to_logical_apicid();
+ return;
+ }
+
+#if 0
+ /*
+ * Should not be necessary because the MP table should list the boot
+ * CPU too, but we do it for the sake of robustness anyway.
+ * Makes no sense to do this check in clustered apic mode, so skip it
+ */
+ if (!check_phys_apicid_present(boot_cpu_physical_apicid)) {
+ printk("weird, boot CPU (#%d) not listed by the BIOS.\n",
+ boot_cpu_physical_apicid);
+ physid_set(hard_smp_processor_id(), phys_cpu_present_map);
+ }
+
+ /*
+ * If we couldn't find a local APIC, then get out of here now!
+ */
+ if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid]) && !cpu_has_apic) {
+ printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n",
+ boot_cpu_physical_apicid);
+ printk(KERN_ERR "... forcing use of dummy APIC emulation. (tell your hw vendor)\n");
+ smpboot_clear_io_apic_irqs();
+ phys_cpu_present_map = physid_mask_of_physid(0);
+ return;
+ }
+
+ verify_local_APIC();
+#endif
+
+ /*
+ * If SMP should be disabled, then really disable it!
+ */
+ if (!max_cpus) {
+ HYPERVISOR_shared_info->n_vcpu = 1;
+ printk(KERN_INFO "SMP mode deactivated, forcing use of dummy APIC emulation.\n");
+ smpboot_clear_io_apic_irqs();
+#if 0
+ phys_cpu_present_map = physid_mask_of_physid(0);
+#endif
+ return;
+ }
+
+ smp_intr_init();
+
+#if 0
+ connect_bsp_APIC();
+ setup_local_APIC();
+#endif
+ map_cpu_to_logical_apicid();
+#if 0
+
+
+ setup_portio_remap();
+
+ /*
+ * Scan the CPU present map and fire up the other CPUs via do_boot_cpu
+ *
+ * In clustered apic mode, phys_cpu_present_map is a constructed thus:
+ * bits 0-3 are quad0, 4-7 are quad1, etc. A perverse twist on the
+ * clustered apic ID.
+ */
+ Dprintk("CPU present map: %lx\n", physids_coerce(phys_cpu_present_map));
+#endif
+ Dprintk("CPU present map: %lx\n",
+ (1UL << HYPERVISOR_shared_info->n_vcpu) - 1);
+
+ /*
+ * 'kicked' counts the boot CPU plus every secondary that came up;
+ * once max_cpus processors are active the remaining VCPUs are skipped.
+ */
+ kicked = 1;
+ for (cpu = 1; kicked < NR_CPUS &&
+ cpu < HYPERVISOR_shared_info->n_vcpu; cpu++) {
+ if (max_cpus <= cpucount+1)
+ continue;
+
+ if (do_boot_cpu(cpu))
+ printk("CPU #%d not responding - cannot use it.\n",
+ cpu);
+ else
+ ++kicked;
+ }
+
+#if 0
+ /*
+ * Cleanup possible dangling ends...
+ */
+ smpboot_restore_warm_reset_vector();
+#endif
+
+ /*
+ * Allow the user to impress friends.
+ */
+ Dprintk("Before bogomips.\n");
+ for (cpu = 0; cpu < NR_CPUS; cpu++)
+ if (cpu_isset(cpu, cpu_callout_map))
+ bogosum += cpu_data[cpu].loops_per_jiffy;
+ printk(KERN_INFO
+ "Total of %d processors activated (%lu.%02lu BogoMIPS).\n",
+ cpucount+1,
+ bogosum/(500000/HZ),
+ (bogosum/(5000/HZ))%100);
+
+ Dprintk("Before bogocount - setting activated=1.\n");
+
+ if (smp_b_stepping)
+ printk(KERN_WARNING "WARNING: SMP operation may be unreliable with B stepping processors.\n");
+
+ /*
+ * Don't taint if we are running SMP kernel on a single non-MP
+ * approved Athlon
+ */
+ if (tainted & TAINT_UNSAFE_SMP) {
+ if (cpucount)
+ printk (KERN_INFO "WARNING: This combination of AMD processors is not suitable for SMP.\n");
+ else
+ tainted &= ~TAINT_UNSAFE_SMP;
+ }
+
+ Dprintk("Boot done.\n");
+
+ /*
+ * construct cpu_sibling_map[], so that we can tell sibling CPUs
+ * efficiently.
+ */
+ for (cpu = 0; cpu < NR_CPUS; cpu++)
+ cpus_clear(cpu_sibling_map[cpu]);
+
+ for (cpu = 0; cpu < NR_CPUS; cpu++) {
+ int siblings = 0;
+ int i;
+ if (!cpu_isset(cpu, cpu_callout_map))
+ continue;
+
+ if (smp_num_siblings > 1) {
+ /* Siblings are the called-out CPUs that share this CPU's package ID. */
+ for (i = 0; i < NR_CPUS; i++) {
+ if (!cpu_isset(i, cpu_callout_map))
+ continue;
+ if (phys_proc_id[cpu] == phys_proc_id[i]) {
+ siblings++;
+ cpu_set(i, cpu_sibling_map[cpu]);
+ }
+ }
+ } else {
+ siblings++;
+ cpu_set(cpu, cpu_sibling_map[cpu]);
+ }
+
+ if (siblings != smp_num_siblings)
+ printk(KERN_WARNING "WARNING: %d siblings found for CPU%d, should be %d\n", siblings, cpu, smp_num_siblings);
+ }
+
+#if 0
+ if (nmi_watchdog == NMI_LOCAL_APIC)
+ check_nmi_watchdog();
+
+ smpboot_setup_io_apic();
+
+ setup_boot_APIC_clock();
+
+ /*
+ * Synchronize the TSC with the AP
+ */
+ if (cpu_has_tsc && cpucount && cpu_khz)
+ synchronize_tsc_bp();
+#endif
+}
+
+/* These are wrappers to interface to the new boot process. Someone
+ who understands all this stuff should rewrite it properly. --RR 15/Jul/02
+
+ smp_prepare_cpus() simply forwards max_cpus to smp_boot_cpus(). */
+void __init smp_prepare_cpus(unsigned int max_cpus)
+{
+ smp_boot_cpus(max_cpus);
+}
+
+/* Mark the calling CPU in both cpu_online_map and cpu_callout_map. */
+void __devinit smp_prepare_boot_cpu(void)
+{
+ cpu_set(smp_processor_id(), cpu_online_map);
+ cpu_set(smp_processor_id(), cpu_callout_map);
+}
+
+/*
+ * Release secondary CPU 'cpu' by setting its bit in smp_commenced_mask,
+ * then wait until it has set its bit in cpu_online_map.
+ *
+ * Returns 0 on success, -ENOSYS if the CPU was already released (this
+ * only works at boot for x86) and -EIO if it never called in.
+ * Interrupts are enabled on every path.
+ */
+int __devinit __cpu_up(unsigned int cpu)
+{
+	int err = 0;
+
+	if (cpu_isset(cpu, smp_commenced_mask))
+		err = -ENOSYS;	/* already commenced: boot-time only */
+	else if (!cpu_isset(cpu, cpu_callin_map))
+		err = -EIO;	/* in case one didn't come up */
+
+	local_irq_enable();
+	if (err)
+		return err;
+
+	/* Unleash the CPU! */
+	cpu_set(cpu, smp_commenced_mask);
+	for (;;) {
+		if (cpu_isset(cpu, cpu_online_map))
+			break;
+		mb();
+	}
+	return 0;
+}
+
+/*
+ * Called once all CPUs have been brought up.  The active (#if 1) branch
+ * is empty, so this is a no-op in this build; the IO-APIC, low-mapping
+ * and trampoline cleanup is kept in the disabled #else branch.
+ */
+void __init smp_cpus_done(unsigned int max_cpus)
+{
+#if 1
+#else
+#ifdef CONFIG_X86_IO_APIC
+ setup_ioapic_dest();
+#endif
+ zap_low_mappings();
+ /*
+ * Disable executability of the SMP trampoline:
+ */
+ set_kernel_exec((unsigned long)trampoline_base, trampoline_exec);
+#endif
+}
+
+extern irqreturn_t smp_reschedule_interrupt(int, void *, struct pt_regs *);
+
+static struct irqaction reschedule_irq = {
+ smp_reschedule_interrupt, SA_INTERRUPT, CPU_MASK_NONE, "reschedule",
+ NULL, NULL
+};
+
+extern irqreturn_t smp_invalidate_interrupt(int, void *, struct pt_regs *);
+
+static struct irqaction invalidate_irq = {
+ smp_invalidate_interrupt, SA_INTERRUPT, CPU_MASK_NONE, "invalidate",
+ NULL, NULL
+};
+
+extern irqreturn_t smp_call_function_interrupt(int, void *, struct pt_regs *);
+
+static struct irqaction call_function_irq = {
+ smp_call_function_interrupt, SA_INTERRUPT, CPU_MASK_NONE,
+ "call_function", NULL, NULL
+};
+
+/*
+ * Bind the reschedule, TLB-invalidate and call-function IPIs of the
+ * calling CPU to IRQs and install their irqactions.  The results of
+ * setup_irq() are deliberately ignored.
+ */
+void __init smp_intr_init(void)
+{
+
+ (void)setup_irq(
+ bind_ipi_on_cpu_to_irq(smp_processor_id(), RESCHEDULE_VECTOR),
+ &reschedule_irq);
+ (void)setup_irq(
+ bind_ipi_on_cpu_to_irq(smp_processor_id(), INVALIDATE_TLB_VECTOR),
+ &invalidate_irq);
+ (void)setup_irq(
+ bind_ipi_on_cpu_to_irq(smp_processor_id(), CALL_FUNCTION_VECTOR),
+ &call_function_irq);
+}
--- /dev/null
+/* Copyright (C) 2004, Christian Limpach */
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/threads.h>
+
+unsigned int __initdata maxcpus = NR_CPUS;
+
+
+/*
+ * the frequency of the profiling timer can be changed
+ * by writing a multiplier value into /proc/profile.
+ *
+ * This implementation is a stub: it only logs the call and
+ * returns 0; 'multiplier' is ignored.
+ */
+int setup_profiling_timer(unsigned int multiplier)
+{
+ printk("setup_profiling_timer\n");
+
+ return 0;
+}
--- /dev/null
+
+obj-y := blktap_userdev.o blktap_datapath.o blktap_controlmsg.o blktap.o
+
--- /dev/null
+/******************************************************************************
+ * blktap.c
+ *
+ * XenLinux virtual block-device tap.
+ *
+ * Copyright (c) 2004, Andrew Warfield
+ *
+ * Based on the original split block driver:
+ * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
+ * Modifications by Mark A. Williamson are (c) Intel Research Cambridge
+ * Copyright (c) 2004, Christian Limpach
+ *
+ * Note that unlike the split block driver code, this driver has been developed
+ * strictly for Linux 2.6
+ */
+
+#include "blktap.h"
+
+/*
+ * Driver initialisation (registered with __initcall()).
+ *
+ * Registers blkif_ctrlif_rx() as the control-interface receiver for both
+ * CMSG_BLKIF_FE and CMSG_BLKIF_BE messages, sends a driver-status "up"
+ * notification of each type to the domain controller, and then calls
+ * blktap_init().  Always returns 0; the results of the registration
+ * calls are discarded.
+ */
+int __init xlblk_init(void)
+{
+ ctrl_msg_t cmsg;
+ blkif_fe_driver_status_t fe_st;
+ blkif_be_driver_status_t be_st;
+
+ printk(KERN_INFO "Initialising Xen block tap device\n");
+
+ DPRINTK(" tap - Backend connection init:\n");
+
+
+ (void)ctrl_if_register_receiver(CMSG_BLKIF_FE, blkif_ctrlif_rx,
+ CALLBACK_IN_BLOCKING_CONTEXT);
+
+ /* Send a driver-UP notification to the domain controller. */
+ cmsg.type = CMSG_BLKIF_FE;
+ cmsg.subtype = CMSG_BLKIF_FE_DRIVER_STATUS;
+ cmsg.length = sizeof(blkif_fe_driver_status_t);
+ fe_st.status = BLKIF_DRIVER_STATUS_UP;
+ memcpy(cmsg.msg, &fe_st, sizeof(fe_st));
+ ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE);
+
+ DPRINTK(" tap - Frontend connection init:\n");
+
+ active_reqs_init();
+
+ ptfe_blkif.status = DISCONNECTED;
+
+ (void)ctrl_if_register_receiver(CMSG_BLKIF_BE, blkif_ctrlif_rx,
+ CALLBACK_IN_BLOCKING_CONTEXT);
+
+ /* Send a driver-UP notification to the domain controller. */
+ cmsg.type = CMSG_BLKIF_BE;
+ cmsg.subtype = CMSG_BLKIF_BE_DRIVER_STATUS;
+ cmsg.length = sizeof(blkif_be_driver_status_t);
+ be_st.status = BLKIF_DRIVER_STATUS_UP;
+ memcpy(cmsg.msg, &be_st, sizeof(be_st));
+ ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE);
+
+ DPRINTK(" tap - Userland channel init:\n");
+
+ blktap_init();
+
+ DPRINTK("Blkif tap device initialized.\n");
+
+ return 0;
+}
+
+/* Suspend hook: currently a no-op. */
+void blkdev_suspend(void)
+{
+}
+
+/*
+ * Resume hook: re-announces driver-UP to the domain controller on the
+ * CMSG_BLKIF_FE channel.  The send result is not inspected.
+ */
+void blkdev_resume(void)
+{
+    blkif_fe_driver_status_t st;
+    ctrl_msg_t               cmsg;
+
+    st.status = BLKIF_DRIVER_STATUS_UP;
+
+    cmsg.type    = CMSG_BLKIF_FE;
+    cmsg.subtype = CMSG_BLKIF_FE_DRIVER_STATUS;
+    cmsg.length  = sizeof(st);
+    memcpy(cmsg.msg, &st, sizeof(st));
+
+    ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE);
+}
+
+
+__initcall(xlblk_init);
--- /dev/null
+/*
+ * blktap.h
+ *
+ * Interfaces for the Xen block tap driver.
+ *
+ * (c) 2004, Andrew Warfield, University of Cambridge
+ *
+ */
+
+#ifndef __BLKTAP_H__
+#define __BLKTAP_H__
+
+#include <linux/version.h>
+#include <linux/blkdev.h>
+#include <linux/config.h>
+#include <linux/sched.h>
+#include <linux/interrupt.h>
+#include <asm-xen/ctrl_if.h>
+#include <linux/slab.h>
+#include <linux/blkdev.h>
+#include <asm/io.h>
+#include <asm/setup.h>
+#include <asm/pgalloc.h>
+#include <asm-xen/hypervisor.h>
+#include <asm-xen/xen-public/io/blkif.h>
+
+/* -------[ debug / pretty printing ]--------------------------------- */
+
+#if 0
+#define ASSERT(_p) \
+ if ( !(_p) ) { printk("Assertion '%s' failed, line %d, file %s", #_p , \
+ __LINE__, __FILE__); *(int*)0=0; }
+#define DPRINTK(_f, _a...) printk(KERN_ALERT "(file=%s, line=%d) " _f, \
+ __FILE__ , __LINE__ , ## _a )
+#else
+#define ASSERT(_p) ((void)0)
+#define DPRINTK(_f, _a...) ((void)0)
+#endif
+
+#define WPRINTK(fmt, args...) printk(KERN_WARNING "blk_tap: " fmt, ##args)
+
+/* -------[ connection / request tracking ]--------------------------- */
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
+#define VMALLOC_VMADDR(x) ((unsigned long)(x))
+#endif
+
+extern spinlock_t blkif_io_lock;
+
+/*
+ * State for one block interface towards a frontend domain.  The tap keeps
+ * a single instance of this, ptfe_blkif.
+ */
+typedef struct blkif_st {
+ /* Unique identifier for this interface. */
+ domid_t domid;
+ unsigned int handle;
+ /* Physical parameters of the comms window. */
+ unsigned long shmem_frame;
+ unsigned int evtchn;
+ /* IRQ returned by bind_evtchn_to_irq(evtchn) at connect time. */
+ int irq;
+ /* Comms information. */
+ blkif_ring_t *blk_ring_base; /* ioremap()'ed ptr to shmem_frame. */
+ BLKIF_RING_IDX blk_req_cons; /* Request consumer. */
+ BLKIF_RING_IDX blk_resp_prod; /* Private version of resp. producer. */
+
+ enum { DISCONNECTED, DISCONNECTING, CONNECTED } status;
+ /*
+ * DISCONNECT response is deferred until pending requests are ack'ed.
+ * We therefore need to store the id from the original request.
+ */ u8 disconnect_rspid;
+ /* NOTE(review): hash_next, blkdev_list and work are not referenced by */
+ /* the tap code visible here; presumably carried over from blkback. */
+ struct blkif_st *hash_next;
+ struct list_head blkdev_list;
+ spinlock_t blk_ring_lock;
+ /* Set to 0 when the interface is created. */
+ atomic_t refcnt;
+
+ struct work_struct work;
+} blkif_t;
+
+/*
+ * Record kept for a request while it is in flight through the tap.  The
+ * request forwarded onwards carries the index of this record as its id;
+ * the original id is restored from here when the response comes back.
+ */
+typedef struct {
+ blkif_t *blkif;
+ unsigned long id; /* id the frontend originally put in the request */
+ /* NOTE(review): the fields below are not used by the code visible */
+ /* here; presumably reserved for page mapping - confirm. */
+ int nr_pages;
+ unsigned long mach_fas[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+ unsigned long virt_fas[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+ int next_free;
+} active_req_t;
+
+
+/* -------[ block ring structs ]-------------------------------------- */
+
+/* Types of ring. */
+#define BLKIF_REQ_RING_TYPE 1
+#define BLKIF_RSP_RING_TYPE 2
+
+/* generic ring struct. */
+typedef struct blkif_generic_ring_struct {
+ int type;
+} blkif_generic_ring_t;
+
+/* A requestor's view of a ring. */
+typedef struct blkif_req_ring_struct {
+
+ int type; /* Will be BLKIF_REQ_RING_TYPE */
+ BLKIF_RING_IDX req_prod; /* PRIVATE req_prod index */
+ BLKIF_RING_IDX rsp_cons; /* Response consumer index */
+ blkif_ring_t *ring; /* Pointer to shared ring struct */
+
+} blkif_req_ring_t;
+
+#define BLKIF_REQ_RING_INIT { BLKIF_REQ_RING_TYPE, 0, 0, 0 }
+
+/* A responder's view of a ring. */
+typedef struct blkif_rsp_ring_struct {
+
+    int type;                 /* Intended to be BLKIF_RSP_RING_TYPE */
+    BLKIF_RING_IDX rsp_prod;  /* PRIVATE rsp_prod index */
+    BLKIF_RING_IDX req_cons;  /* Request consumer index */
+    blkif_ring_t *ring;       /* Pointer to shared ring struct */
+
+} blkif_rsp_ring_t;
+
+/*
+ * Static initialiser, to be used as "blkif_rsp_ring_t r = BLKIF_RSP_RING_INIT;".
+ * It must not contain its own '=' (same form as BLKIF_REQ_RING_INIT).
+ */
+#define BLKIF_RSP_RING_INIT { BLKIF_RSP_RING_TYPE, 0, 0, 0 }
+
+/*
+ * View any of the ring structs above through their common leading "type"
+ * field.  The whole expansion is parenthesised so that the cast cannot
+ * bind to a following operator at the point of use.
+ */
+#define RING(a) ((blkif_generic_ring_t *)(a))
+
+/* Non-zero if no further entry can be added to the given ring. */
+inline int BLKTAP_RING_FULL(blkif_generic_ring_t *ring);
+
+
+/* -------[ interposition -> character device interface ]------------- */
+
+/* /dev/xen/blktap resides at device number major=10, minor=202 */
+#define BLKTAP_MINOR 202
+
+/* size of the extra VMA area to map in attached pages. */
+#define BLKTAP_VMA_PAGES BLKIF_RING_SIZE
+
+/* blktap IOCTLs: */
+#define BLKTAP_IOCTL_KICK_FE 1
+#define BLKTAP_IOCTL_KICK_BE 2
+#define BLKTAP_IOCTL_SETMODE 3
+
+/* blktap switching modes: (Set with BLKTAP_IOCTL_SETMODE) */
+#define BLKTAP_MODE_PASSTHROUGH 0x00000000 /* default */
+#define BLKTAP_MODE_INTERCEPT_FE 0x00000001
+#define BLKTAP_MODE_INTERCEPT_BE 0x00000002
+#define BLKTAP_MODE_COPY_FE 0x00000004
+#define BLKTAP_MODE_COPY_BE 0x00000008
+#define BLKTAP_MODE_COPY_FE_PAGES 0x00000010
+#define BLKTAP_MODE_COPY_BE_PAGES 0x00000020
+
+#define BLKTAP_MODE_INTERPOSE \
+ (BLKTAP_MODE_INTERCEPT_FE | BLKTAP_MODE_INTERCEPT_BE)
+
+#define BLKTAP_MODE_COPY_BOTH \
+ (BLKTAP_MODE_COPY_FE | BLKTAP_MODE_COPY_BE)
+
+#define BLKTAP_MODE_COPY_BOTH_PAGES \
+ (BLKTAP_MODE_COPY_FE_PAGES | BLKTAP_MODE_COPY_BE_PAGES)
+
+/*
+ * Check a mode word passed to BLKTAP_IOCTL_SETMODE.
+ *
+ * Accepted values are PASSTHROUGH, INTERCEPT_FE, INTERCEPT_BE, INTERPOSE,
+ * and COPY_FE / COPY_BE / COPY_BOTH, each of the COPY forms optionally
+ * combined with its own *_PAGES bit(s).  Returns 1 if accepted, else 0.
+ */
+static inline int BLKTAP_MODE_VALID(unsigned long arg)
+{
+    /* The exact-match modes. */
+    if ( arg == BLKTAP_MODE_PASSTHROUGH )
+        return 1;
+    if ( arg == BLKTAP_MODE_INTERCEPT_FE )
+        return 1;
+    if ( arg == BLKTAP_MODE_INTERCEPT_BE )
+        return 1;
+    if ( arg == BLKTAP_MODE_INTERPOSE )
+        return 1;
+
+    /* The copy modes: ignore the matching *_PAGES bits, then compare. */
+    if ( (arg & ~BLKTAP_MODE_COPY_FE_PAGES) == BLKTAP_MODE_COPY_FE )
+        return 1;
+    if ( (arg & ~BLKTAP_MODE_COPY_BE_PAGES) == BLKTAP_MODE_COPY_BE )
+        return 1;
+    if ( (arg & ~BLKTAP_MODE_COPY_BOTH_PAGES) == BLKTAP_MODE_COPY_BOTH )
+        return 1;
+
+    return 0;
+}
+
+
+
+/* -------[ Mappings to User VMA ]------------------------------------ */
+#define MAX_PENDING_REQS 64
+#define BATCH_PER_DOMAIN 16
+extern struct vm_area_struct *blktap_vma;
+
+/* The following are from blkback.c and should probably be put in a
+ * header and included from there.
+ * The mmap area described here is where attached data pages will be mapped.
+ */
+
+extern unsigned long mmap_vstart;
+#define MMAP_PAGES_PER_REQUEST \
+ (BLKIF_MAX_SEGMENTS_PER_REQUEST + 1)
+#define MMAP_PAGES \
+ (MAX_PENDING_REQS * MMAP_PAGES_PER_REQUEST)
+#define MMAP_VADDR(_req,_seg) \
+ (mmap_vstart + \
+ ((_req) * MMAP_PAGES_PER_REQUEST * PAGE_SIZE) + \
+ ((_seg) * PAGE_SIZE))
+
+/* immediately before the mmap area, we have a bunch of pages reserved
+ * for shared memory rings.
+ */
+
+#define RING_PAGES 128
+extern unsigned long rings_vstart;
+
+/* -------[ Here be globals ]----------------------------------------- */
+
+extern unsigned long blktap_mode;
+
+
+/* blkif struct, containing ring to FE domain */
+extern blkif_t ptfe_blkif;
+
+/* Connection to a single backend domain. */
+extern blkif_ring_t *blk_ptbe_ring; /* Ring from the PT to the BE dom */
+extern BLKIF_RING_IDX ptbe_resp_cons; /* Response consumer for comms ring. */
+extern BLKIF_RING_IDX ptbe_req_prod; /* Private request producer. */
+
+/* Rings up to user space. */
+extern blkif_req_ring_t fe_ring;// = BLKIF_REQ_RING_INIT;
+extern blkif_rsp_ring_t be_ring;// = BLKIF_RSP_RING_INIT;
+
+/* Event channel to backend domain. */
+extern unsigned int blkif_ptbe_evtchn;
+
+/* User ring status... this will soon vanish into a ring struct. */
+extern unsigned long blktap_ring_ok;
+
+/* -------[ ...and function prototypes. ]----------------------------- */
+
+/* init function for character device interface. */
+int blktap_init(void);
+
+/* interfaces to the char driver, passing messages to and from apps. */
+void blktap_kick_user(void);
+int blktap_write_to_ring(blkif_request_t *req);
+
+
+/* user ring access functions: */
+int blktap_write_fe_ring(blkif_request_t *req);
+int blktap_write_be_ring(blkif_response_t *rsp);
+int blktap_read_fe_ring(void);
+int blktap_read_be_ring(void);
+
+/* and the helpers they call: */
+inline int write_resp_to_fe_ring(blkif_response_t *rsp);
+inline void kick_fe_domain(void);
+
+inline int write_req_to_be_ring(blkif_request_t *req);
+inline void kick_be_domain(void);
+
+/* Interrupt handlers. */
+irqreturn_t blkif_ptbe_int(int irq, void *dev_id,
+ struct pt_regs *ptregs);
+irqreturn_t blkif_ptfe_int(int irq, void *dev_id, struct pt_regs *regs);
+
+/* Control message receiver. */
+extern void blkif_ctrlif_rx(ctrl_msg_t *msg, unsigned long id);
+
+#endif /* __BLKTAP_H__ */
--- /dev/null
+/******************************************************************************
+ * blktap_controlmsg.c
+ *
+ * XenLinux virtual block-device tap.
+ * Control interfaces to the frontend and backend drivers.
+ *
+ * Copyright (c) 2004, Andrew Warfield
+ *
+ */
+
+#include "blktap.h"
+
+#define BLKIF_STATE_CLOSED 0
+#define BLKIF_STATE_DISCONNECTED 1
+#define BLKIF_STATE_CONNECTED 2
+
+static char *blkif_state_name[] = {
+ [BLKIF_STATE_CLOSED] = "closed",
+ [BLKIF_STATE_DISCONNECTED] = "disconnected",
+ [BLKIF_STATE_CONNECTED] = "connected",
+};
+
+static char * blkif_status_name[] = {
+ [BLKIF_INTERFACE_STATUS_CLOSED] = "closed",
+ [BLKIF_INTERFACE_STATUS_DISCONNECTED] = "disconnected",
+ [BLKIF_INTERFACE_STATUS_CONNECTED] = "connected",
+ [BLKIF_INTERFACE_STATUS_CHANGED] = "changed",
+};
+static unsigned int blkif_pt_state = BLKIF_STATE_CLOSED;
+static unsigned blkif_ptbe_irq;
+unsigned int blkif_ptbe_evtchn;
+
+/*-----[ Control Messages to/from Frontend VMs ]--------------------------*/
+
+
+/*
+ * Handle CMSG_BLKIF_BE_CREATE: reset the single frontend interface record
+ * (ptfe_blkif), stamp it with the domain id and handle from the message,
+ * leave it DISCONNECTED, and report BLKIF_BE_STATUS_OKAY.
+ */
+void blkif_ptfe_create(blkif_be_create_t *create)
+{
+    /* Indirection kept for convenience if the hash is readded later. */
+    blkif_t *blkif = &ptfe_blkif;
+
+    /* May want to store info on the connecting domain here. */
+
+    DPRINTK("PT got BE_CREATE\n");
+
+    /* blkif struct init code from blkback.c */
+    memset(blkif, 0, sizeof(*blkif));
+    blkif->domid  = create->domid;
+    blkif->handle = create->blkif_handle;
+    blkif->status = DISCONNECTED;
+    spin_lock_init(&blkif->blk_ring_lock);
+    atomic_set(&blkif->refcnt, 0);
+
+    create->status = BLKIF_BE_STATUS_OKAY;
+}
+
+
+/*
+ * Handle CMSG_BLKIF_BE_DESTROY.  Nothing is torn down at present; the
+ * request is simply acknowledged with BLKIF_BE_STATUS_OKAY.
+ */
+void blkif_ptfe_destroy(blkif_be_destroy_t *destroy)
+{
+    DPRINTK("PT got BE_DESTROY\n");
+
+    /* Anything set up by blkif_ptfe_create() would be cleared here. */
+    destroy->status = BLKIF_BE_STATUS_OKAY;
+}
+
+/*
+ * Handle CMSG_BLKIF_BE_CONNECT: map the frontend's shared ring frame into
+ * a freshly reserved kernel virtual area, bind the event channel to an IRQ
+ * and attach blkif_ptfe_int() to it.
+ *
+ * connect->status is set to one of:
+ *   BLKIF_BE_STATUS_OUT_OF_MEMORY       - no virtual area, or remap -ENOMEM
+ *   BLKIF_BE_STATUS_MAPPING_ERROR       - remap returned -EFAULT
+ *   BLKIF_BE_STATUS_ERROR               - any other remap failure
+ *   BLKIF_BE_STATUS_INTERFACE_CONNECTED - interface was not DISCONNECTED
+ *   BLKIF_BE_STATUS_OKAY                - success
+ *
+ * NOTE(review): the return value of request_irq() is not checked.
+ */
+void blkif_ptfe_connect(blkif_be_connect_t *connect)
+{
+    /* Indirection kept for convenience if the hash is readded later. */
+    blkif_t          *blkif       = &ptfe_blkif;
+    domid_t           domid       = connect->domid;
+    unsigned int      evtchn      = connect->evtchn;
+    unsigned long     shmem_frame = connect->shmem_frame;
+    struct vm_struct *area;
+    pgprot_t          prot;
+    int               rc;
+
+    DPRINTK("PT got BE_CONNECT\n");
+
+    area = get_vm_area(PAGE_SIZE, VM_IOREMAP);
+    if ( area == NULL )
+    {
+        connect->status = BLKIF_BE_STATUS_OUT_OF_MEMORY;
+        return;
+    }
+
+    prot = __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED);
+    rc = direct_remap_area_pages(&init_mm, VMALLOC_VMADDR(area->addr),
+                                 shmem_frame<<PAGE_SHIFT, PAGE_SIZE,
+                                 prot, domid);
+    if ( rc != 0 )
+    {
+        WPRINTK("BE_CONNECT: error! (%d)\n", rc);
+        switch ( rc )
+        {
+        case -ENOMEM:
+            connect->status = BLKIF_BE_STATUS_OUT_OF_MEMORY;
+            break;
+        case -EFAULT:
+            connect->status = BLKIF_BE_STATUS_MAPPING_ERROR;
+            WPRINTK("BE_CONNECT: MAPPING error!\n");
+            break;
+        default:
+            connect->status = BLKIF_BE_STATUS_ERROR;
+            break;
+        }
+        vfree(area->addr);
+        return;
+    }
+
+    /* The state check is made only after the mapping has succeeded. */
+    if ( blkif->status != DISCONNECTED )
+    {
+        connect->status = BLKIF_BE_STATUS_INTERFACE_CONNECTED;
+        vfree(area->addr);
+        return;
+    }
+
+    blkif->evtchn        = evtchn;
+    blkif->irq           = bind_evtchn_to_irq(evtchn);
+    blkif->shmem_frame   = shmem_frame;
+    blkif->blk_ring_base = (blkif_ring_t *)area->addr;
+    blkif->status        = CONNECTED;
+    /*blkif_get(blkif);*/
+
+    request_irq(blkif->irq, blkif_ptfe_int, 0, "blkif-pt-backend", blkif);
+
+    connect->status = BLKIF_BE_STATUS_OKAY;
+}
+
+/*
+ * Handle CMSG_BLKIF_BE_DISCONNECT.
+ *
+ * The passthrough is not actually moved to a disconnected state: the tap
+ * only acts as a pipe and leaves things like recovery to the real ends.
+ * The request is acknowledged with BLKIF_BE_STATUS_OKAY.
+ */
+void blkif_ptfe_disconnect(blkif_be_disconnect_t *disconnect)
+{
+    DPRINTK("PT got BE_DISCONNECT\n");
+
+    disconnect->status = BLKIF_BE_STATUS_OKAY;
+}
+
+/*-----[ Control Messages to/from Backend VM ]----------------------------*/
+
+/*
+ * Ask the controller to bring up interface 0, passing the machine frame
+ * number of the ring page (blk_ptbe_ring) that is shared with the backend.
+ */
+static void blkif_ptbe_send_interface_connect(void)
+{
+    ctrl_msg_t cmsg;
+    blkif_fe_interface_connect_t *payload;
+
+    /* Start from an all-zero message, then fill in the header. */
+    memset(&cmsg, 0, sizeof(cmsg));
+    cmsg.type    = CMSG_BLKIF_FE;
+    cmsg.subtype = CMSG_BLKIF_FE_INTERFACE_CONNECT;
+    cmsg.length  = sizeof(blkif_fe_interface_connect_t);
+
+    /* The payload is built in place inside the message buffer. */
+    payload = (void*)cmsg.msg;
+    payload->handle      = 0;
+    payload->shmem_frame = virt_to_machine(blk_ptbe_ring) >> PAGE_SHIFT;
+
+    ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE);
+}
+
+/* Placeholder for closing the connection to the backend; does nothing. */
+static void blkif_ptbe_close(void)
+{
+}
+
+/*
+ * Move from CLOSED to DISCONNECTED state.
+ *
+ * Allocates the page used as the shared ring towards the backend, resets
+ * the shared and private ring indices, and asks the controller to connect
+ * the interface.
+ *
+ * If the page cannot be allocated, a warning is logged and nothing else
+ * happens: the state stays CLOSED and no connect request is sent.
+ */
+static void blkif_ptbe_disconnect(void)
+{
+    blkif_ring_t *ring = (blkif_ring_t *)__get_free_page(GFP_KERNEL);
+
+    /* __get_free_page() yields 0 on failure; never dereference that. */
+    if ( ring == NULL )
+    {
+        WPRINTK("unable to allocate backend ring page\n");
+        return;
+    }
+
+    blk_ptbe_ring = ring;
+    blk_ptbe_ring->req_prod = blk_ptbe_ring->resp_prod
+        = ptbe_resp_cons = ptbe_req_prod = 0;
+    blkif_pt_state = BLKIF_STATE_DISCONNECTED;
+    DPRINTK("Blkif-Passthrough-BE is now DISCONNECTED.\n");
+    blkif_ptbe_send_interface_connect();
+}
+
+/*
+ * Complete the connection to the backend: record the event channel from
+ * the status message, bind it to an IRQ and attach blkif_ptbe_int().
+ * The state becomes CONNECTED only if request_irq() succeeds; on failure
+ * a warning is logged and the state is left unchanged.
+ */
+static void blkif_ptbe_connect(blkif_fe_interface_status_t *status)
+{
+    int rc;
+
+    blkif_ptbe_evtchn = status->evtchn;
+    blkif_ptbe_irq    = bind_evtchn_to_irq(blkif_ptbe_evtchn);
+
+    rc = request_irq(blkif_ptbe_irq, blkif_ptbe_int,
+                     SA_SAMPLE_RANDOM, "blkif", NULL);
+    if ( rc != 0 )
+    {
+        WPRINTK("blkfront request_irq failed (%d)\n", rc);
+        return;
+    }
+
+    /*
+     * Transition to connected in case we need to do a partition probe
+     * on a whole disk.
+     */
+    blkif_pt_state = BLKIF_STATE_CONNECTED;
+}
+
+/*
+ * Log a warning naming the interface status that was received and the
+ * state the tap was in when it arrived.
+ */
+static void unexpected(blkif_fe_interface_status_t *status)
+{
+    char *got = blkif_status_name[status->status];
+    char *in  = blkif_state_name[blkif_pt_state];
+
+    WPRINTK(" TAP: Unexpected blkif status %s in state %s\n", got, in);
+}
+
+/*
+ * React to an interface status message for the backend-facing side.
+ *
+ * Only handle 0 is supported.  For each reported status the action depends
+ * on the current blkif_pt_state (which only ever holds one of the three
+ * BLKIF_STATE_* values):
+ *
+ *   status \ state | CLOSED              | DISCONNECTED   | CONNECTED
+ *   CLOSED         | warn                | warn, close    | warn, close
+ *   DISCONNECTED   | disconnect          | alert, warn    | alert, warn
+ *   CONNECTED      | warn, disc, connect | connect        | warn, connect
+ *   CHANGED        | warn                | warn           | warn
+ */
+static void blkif_ptbe_status(blkif_fe_interface_status_t *status)
+{
+    if ( status->handle != 0 )
+    {
+        DPRINTK("Status change on unsupported blkif %d\n",
+                status->handle);
+        return;
+    }
+
+    DPRINTK("ptbe_status: got %s\n", blkif_status_name[status->status]);
+
+    switch ( status->status )
+    {
+    case BLKIF_INTERFACE_STATUS_CLOSED:
+        unexpected(status);
+        if ( blkif_pt_state != BLKIF_STATE_CLOSED )
+            blkif_ptbe_close();
+        break;
+
+    case BLKIF_INTERFACE_STATUS_DISCONNECTED:
+        if ( blkif_pt_state == BLKIF_STATE_CLOSED )
+        {
+            blkif_ptbe_disconnect();
+            break;
+        }
+        printk(KERN_ALERT "*** add recovery code to the tap driver. ***\n");
+        unexpected(status);
+        break;
+
+    case BLKIF_INTERFACE_STATUS_CONNECTED:
+        /* Only the DISCONNECTED -> CONNECTED step is the expected one. */
+        if ( blkif_pt_state != BLKIF_STATE_DISCONNECTED )
+            unexpected(status);
+        /* From CLOSED, first do the CLOSED -> DISCONNECTED step. */
+        if ( blkif_pt_state == BLKIF_STATE_CLOSED )
+            blkif_ptbe_disconnect();
+        blkif_ptbe_connect(status);
+        break;
+
+    case BLKIF_INTERFACE_STATUS_CHANGED:
+        /* vbd_update(); */
+        /* tap doesn't really get state changes... */
+        unexpected(status);
+        break;
+
+    default:
+        DPRINTK("Status change to unknown value %d\n", status->status);
+        break;
+    }
+}
+
+/*-----[ All control messages enter here: ]-------------------------------*/
+
+/*
+ * Entry point for every control message delivered to the tap.
+ *
+ * Dispatches on msg->type and then msg->subtype, after checking that
+ * msg->length matches the size of the expected payload structure.  The
+ * handlers write their result into the payload in place, and the message
+ * is then sent back as the response.  A message with an unknown subtype or
+ * a bad length is answered with length 0.
+ *
+ * msg: the received message; reused as the response.
+ * id:  not used.
+ */
+void blkif_ctrlif_rx(ctrl_msg_t *msg, unsigned long id)
+{
+    switch ( msg->type )
+    {
+    case CMSG_BLKIF_FE:
+
+        switch ( msg->subtype )
+        {
+        case CMSG_BLKIF_FE_INTERFACE_STATUS:
+            if ( msg->length != sizeof(blkif_fe_interface_status_t) )
+                goto parse_error;
+            blkif_ptbe_status((blkif_fe_interface_status_t *) &msg->msg[0]);
+            break;
+
+        default:
+            goto parse_error;
+        }
+        /* Must not fall through into the CMSG_BLKIF_BE dispatch below. */
+        break;
+
+    case CMSG_BLKIF_BE:
+
+        switch ( msg->subtype )
+        {
+        case CMSG_BLKIF_BE_CREATE:
+            if ( msg->length != sizeof(blkif_be_create_t) )
+                goto parse_error;
+            blkif_ptfe_create((blkif_be_create_t *)&msg->msg[0]);
+            break;
+        case CMSG_BLKIF_BE_DESTROY:
+            if ( msg->length != sizeof(blkif_be_destroy_t) )
+                goto parse_error;
+            blkif_ptfe_destroy((blkif_be_destroy_t *)&msg->msg[0]);
+            break;
+        case CMSG_BLKIF_BE_CONNECT:
+            if ( msg->length != sizeof(blkif_be_connect_t) )
+                goto parse_error;
+            blkif_ptfe_connect((blkif_be_connect_t *)&msg->msg[0]);
+            break;
+        case CMSG_BLKIF_BE_DISCONNECT:
+            if ( msg->length != sizeof(blkif_be_disconnect_t) )
+                goto parse_error;
+            blkif_ptfe_disconnect((blkif_be_disconnect_t *)&msg->msg[0]);
+            break;
+
+        /* We just ignore anything to do with vbds for now. */
+
+        case CMSG_BLKIF_BE_VBD_CREATE:
+            DPRINTK("PT got VBD_CREATE\n");
+            ((blkif_be_vbd_create_t *)&msg->msg[0])->status
+                = BLKIF_BE_STATUS_OKAY;
+            break;
+        case CMSG_BLKIF_BE_VBD_DESTROY:
+            DPRINTK("PT got VBD_DESTROY\n");
+            ((blkif_be_vbd_destroy_t *)&msg->msg[0])->status
+                = BLKIF_BE_STATUS_OKAY;
+            break;
+        case CMSG_BLKIF_BE_VBD_GROW:
+            DPRINTK("PT got VBD_GROW\n");
+            ((blkif_be_vbd_grow_t *)&msg->msg[0])->status
+                = BLKIF_BE_STATUS_OKAY;
+            break;
+        case CMSG_BLKIF_BE_VBD_SHRINK:
+            DPRINTK("PT got VBD_SHRINK\n");
+            ((blkif_be_vbd_shrink_t *)&msg->msg[0])->status
+                = BLKIF_BE_STATUS_OKAY;
+            break;
+        default:
+            goto parse_error;
+        }
+        break;
+    }
+
+    ctrl_if_send_response(msg);
+    return;
+
+ parse_error:
+    msg->length = 0;
+    ctrl_if_send_response(msg);
+}
--- /dev/null
+/******************************************************************************
+ * blktap_datapath.c
+ *
+ * XenLinux virtual block-device tap.
+ * Block request routing data path.
+ *
+ * Copyright (c) 2004, Andrew Warfield
+ *
+ */
+
+#include "blktap.h"
+
+/*-----[ The data paths ]-------------------------------------------------*/
+
+/* Connections to the frontend domains.*/
+blkif_t ptfe_blkif;
+
+/* Connection to a single backend domain. */
+blkif_ring_t *blk_ptbe_ring; /* Ring from the PT to the BE dom */
+BLKIF_RING_IDX ptbe_resp_cons; /* Response consumer for comms ring. */
+BLKIF_RING_IDX ptbe_req_prod; /* Private request producer. */
+
+/* Rings up to user space. */
+blkif_req_ring_t fe_ring;// = BLKIF_REQ_RING_INIT;
+blkif_rsp_ring_t be_ring;// = BLKIF_RSP_RING_INIT;
+
+/*-----[ Ring helpers ]---------------------------------------------------*/
+
+/*
+ * Report whether a ring has no room for another entry.
+ *
+ * A request ring is full when BLKIF_RING_SIZE requests are outstanding
+ * (private producer minus response consumer).  Any other ring type is
+ * reported as never full.  Returns 1 if full, 0 otherwise.
+ */
+inline int BLKTAP_RING_FULL(blkif_generic_ring_t *ring)
+{
+    blkif_req_ring_t *req_ring;
+
+    /* for now assume that there is always room in the response path. */
+    if ( ring->type != BLKIF_REQ_RING_TYPE )
+        return 0;
+
+    req_ring = (blkif_req_ring_t *)ring;
+    return ( (req_ring->req_prod - req_ring->rsp_cons) == BLKIF_RING_SIZE );
+}
+
+/*-----[ Tracking active requests ]---------------------------------------*/
+
+/* this must be the same as MAX_PENDING_REQS in blkback.c */
+#define MAX_ACTIVE_REQS 64
+
+active_req_t active_reqs[MAX_ACTIVE_REQS];
+unsigned char active_req_ring[MAX_ACTIVE_REQS];
+spinlock_t active_req_lock = SPIN_LOCK_UNLOCKED;
+typedef unsigned int ACTIVE_RING_IDX;
+ACTIVE_RING_IDX active_prod, active_cons;
+#define MASK_ACTIVE_IDX(_i) ((_i)&(MAX_ACTIVE_REQS-1))
+#define ACTIVE_IDX(_ar) (_ar - active_reqs)
+
+/*
+ * Take a free active-request record from the pool.
+ *
+ * Free records are tracked by index in active_req_ring[]: free_active_req()
+ * pushes the index of a released record at active_prod, and this function
+ * pops the next index at active_cons.  The record must be looked up through
+ * that ring; records are released in completion order, so the consumer
+ * counter itself does not identify a free record.
+ *
+ * The pop is done under active_req_lock, the same lock free_active_req()
+ * uses for the push.
+ *
+ * The caller must ensure the pool is not empty; this is only checked by
+ * ASSERT, which is compiled out by default.
+ */
+inline active_req_t *get_active_req(void)
+{
+    ACTIVE_RING_IDX idx;
+    unsigned long flags;
+
+    ASSERT(active_cons != active_prod);
+
+    spin_lock_irqsave(&active_req_lock, flags);
+    idx = active_req_ring[MASK_ACTIVE_IDX(active_cons++)];
+    spin_unlock_irqrestore(&active_req_lock, flags);
+
+    return &active_reqs[idx];
+}
+
+/*
+ * Return an active-request record to the pool by pushing its index onto
+ * active_req_ring[] at the producer position, under active_req_lock.
+ */
+inline void free_active_req(active_req_t *ar)
+{
+    unsigned long irq_state;
+    ACTIVE_RING_IDX slot;
+
+    spin_lock_irqsave(&active_req_lock, irq_state);
+    slot = MASK_ACTIVE_IDX(active_prod);
+    active_prod++;
+    active_req_ring[slot] = ACTIVE_IDX(ar);
+    spin_unlock_irqrestore(&active_req_lock, irq_state);
+}
+
+/*
+ * Reset the active-request pool: zero every record, list every index as
+ * free in active_req_ring[], and set the consumer/producer counters so
+ * that all MAX_ACTIVE_REQS records are available.
+ */
+inline void active_reqs_init(void)
+{
+    ACTIVE_RING_IDX slot;
+
+    memset(active_reqs, 0, sizeof(active_reqs));
+
+    for ( slot = 0; slot < MAX_ACTIVE_REQS; slot++ )
+        active_req_ring[slot] = slot;
+
+    active_cons = 0;
+    active_prod = MAX_ACTIVE_REQS;
+}
+
+/*-----[ Data to/from Frontend (client) VMs ]-----------------------------*/
+
+/*
+ * Interrupt handler for the event channel from the real frontend.
+ *
+ * Consumes every new request on the frontend ring.  For each one it:
+ *   - takes an active-request record, saves the frontend's id in it and
+ *     replaces the id in the request with the record's index;
+ *   - for PROBE requests, stores the frontend's domain id in
+ *     frame_and_sects[1] (see BLKIF_OP_PROBE_HACK below);
+ *   - in INTERCEPT_FE or COPY_FE mode, hands the request to the user ring;
+ *   - if neither INTERCEPT_FE nor INTERCEPT_BE is set, copies the request
+ *     to the backend ring.
+ * Afterwards, if anything was consumed, the backend and/or the user-space
+ * application are notified according to the same mode tests.
+ *
+ * The whole handler runs under blkif_io_lock with interrupts disabled.
+ * The request is rewritten in place in the frontend's ring.
+ * Always returns IRQ_HANDLED.
+ */
+irqreturn_t blkif_ptfe_int(int irq, void *dev_id, struct pt_regs *regs)
+{
+    /* we have pending messages from the real frontend. */
+
+    blkif_request_t *req_s, *req_d;
+    BLKIF_RING_IDX fe_rp;
+    unsigned long flags;
+    int notify;
+    BLKIF_RING_IDX i;   /* same type as the ring indices it is compared to */
+    active_req_t *ar;
+
+    DPRINTK("PT got FE interrupt.\n");
+
+    /* lock both rings */
+    spin_lock_irqsave(&blkif_io_lock, flags);
+
+    /* While there are REQUESTS on FERing: */
+    fe_rp = ptfe_blkif.blk_ring_base->req_prod;
+    rmb();
+    notify = (ptfe_blkif.blk_req_cons != fe_rp);
+
+    for (i = ptfe_blkif.blk_req_cons; i != fe_rp; i++) {
+
+        /* Get the next request */
+        req_s = &ptfe_blkif.blk_ring_base->ring[MASK_BLKIF_IDX(i)].req;
+
+        /* This is a new request:
+         * Assign an active request record, and remap the id.
+         */
+        ar = get_active_req();
+        ar->id = req_s->id;
+        req_s->id = ACTIVE_IDX(ar);
+        DPRINTK("%3lu < %3lu\n", req_s->id, ar->id);
+
+        /* FE -> BE interposition point is here. */
+
+        /* ------------------------------------------------------------- */
+        /* BLKIF_OP_PROBE_HACK:                                          */
+        /* Until we have grant tables, we need to allow the backend to   */
+        /* map pages that are either from this domain, or more commonly  */
+        /* from the real front end.  We achieve this in a terrible way,  */
+        /* by passing the front end's domid along with PROBE messages.   */
+        /* Once grant tables appear, this should all go away.            */
+
+        if (req_s->operation == BLKIF_OP_PROBE) {
+            DPRINTK("Adding FE domid to PROBE request.\n");
+            /* Plain assignment: a cast is not a valid assignment target. */
+            req_s->frame_and_sects[1] = ptfe_blkif.domid;
+        }
+
+        /* ------------------------------------------------------------- */
+
+        /* If we are in MODE_INTERCEPT_FE or MODE_COPY_FE: */
+        if ( (blktap_mode & BLKTAP_MODE_INTERCEPT_FE) ||
+             (blktap_mode & BLKTAP_MODE_COPY_FE) ) {
+
+            /* Copy the request message to UFERing */
+            /* In MODE_INTERCEPT_FE, map attached pages into the app vma */
+            /* In MODE_COPY_FE_PAGES, copy attached pages into the app vma */
+
+            /* XXX: mapping/copying of attached pages is still not done! */
+
+            DPRINTK("req->UFERing\n");
+            blktap_write_fe_ring(req_s);
+
+
+        }
+
+        /* If we are not in MODE_INTERCEPT_FE or MODE_INTERCEPT_BE: */
+        if ( !((blktap_mode & BLKTAP_MODE_INTERCEPT_FE) ||
+               (blktap_mode & BLKTAP_MODE_INTERCEPT_BE)) ) {
+
+            /* be included to prevent noise from the fe when its off */
+            /* copy the request message to the BERing */
+
+            DPRINTK("blktap: FERing[%u] -> BERing[%u]\n",
+                    (unsigned)MASK_BLKIF_IDX(i),
+                    (unsigned)MASK_BLKIF_IDX(ptbe_req_prod));
+
+            req_d = &blk_ptbe_ring->ring[MASK_BLKIF_IDX(ptbe_req_prod)].req;
+
+            memcpy(req_d, req_s, sizeof(blkif_request_t));
+
+            ptbe_req_prod++;
+        }
+    }
+
+    ptfe_blkif.blk_req_cons = i;
+
+    /* If we have forwarded any requests, notify the appropriate ends. */
+    if (notify) {
+
+        /* we have sent stuff to the be, notify it. */
+        if ( !((blktap_mode & BLKTAP_MODE_INTERCEPT_FE) ||
+               (blktap_mode & BLKTAP_MODE_INTERCEPT_BE)) ) {
+            /* Requests must be visible before the producer index moves. */
+            wmb();
+            blk_ptbe_ring->req_prod = ptbe_req_prod;
+
+            notify_via_evtchn(blkif_ptbe_evtchn);
+            DPRINTK(" -- and notified.\n");
+        }
+
+        /* we sent stuff to the app, notify it. */
+        if ( (blktap_mode & BLKTAP_MODE_INTERCEPT_FE) ||
+             (blktap_mode & BLKTAP_MODE_COPY_FE) ) {
+
+            blktap_kick_user();
+        }
+    }
+
+    /* unlock rings */
+    spin_unlock_irqrestore(&blkif_io_lock, flags);
+
+    return IRQ_HANDLED;
+}
+
+/*
+ * Copy one request into the next free slot of the backend ring and advance
+ * the private producer index.  The shared producer index is not touched
+ * here; kick_be_domain() publishes it.  Always returns 0.
+ */
+inline int write_req_to_be_ring(blkif_request_t *req)
+{
+    blkif_request_t *slot =
+        &blk_ptbe_ring->ring[MASK_BLKIF_IDX(ptbe_req_prod)].req;
+
+    memcpy(slot, req, sizeof(blkif_request_t));
+    ptbe_req_prod++;
+
+    return 0;
+}
+
+/*
+ * Publish the private request producer index on the backend ring and
+ * notify the backend through its event channel.
+ */
+inline void kick_be_domain(void)
+{
+    /* Queued requests must be visible before the producer index moves. */
+    wmb();
+    blk_ptbe_ring->req_prod = ptbe_req_prod;
+
+    notify_via_evtchn(blkif_ptbe_evtchn);
+}
+
+/*-----[ Data to/from Backend (server) VM ]------------------------------*/
+
+
+/*
+ * Interrupt handler for the event channel from the backend.
+ *
+ * Consumes every new response on the backend ring.  For each one it:
+ *   - in INTERCEPT_BE or COPY_BE mode, hands the response to the user ring;
+ *   - if neither INTERCEPT_BE nor INTERCEPT_FE is set, restores the id the
+ *     frontend originally used, releases the active-request record and
+ *     copies the response to the frontend ring.
+ * Afterwards, if anything was consumed, the frontend and/or the user-space
+ * application are notified according to the same mode tests.
+ *
+ * Runs under blkif_io_lock with interrupts disabled.
+ * Always returns IRQ_HANDLED.
+ *
+ * NOTE(review): resp_s->id is used to index active_reqs[] without a range
+ * check, and the id is rewritten in place in the backend ring.
+ */
+irqreturn_t blkif_ptbe_int(int irq, void *dev_id,
+ struct pt_regs *ptregs)
+{
+ blkif_response_t *resp_s, *resp_d;
+ BLKIF_RING_IDX be_rp;
+ unsigned long flags;
+ int notify;
+ unsigned long i;
+ active_req_t *ar;
+
+ DPRINTK("PT got BE interrupt.\n");
+
+ /* lock both rings */
+ spin_lock_irqsave(&blkif_io_lock, flags);
+
+ /* While there are RESPONSES on BERing: */
+ be_rp = blk_ptbe_ring->resp_prod;
+ /* Read the producer index before reading the entries it covers. */
+ rmb();
+ notify = (ptbe_resp_cons != be_rp);
+
+ for ( i = ptbe_resp_cons; i != be_rp; i++ )
+ {
+ /* BE -> FE interposition point is here. */
+
+ /* Get the next response */
+ resp_s = &blk_ptbe_ring->ring[MASK_BLKIF_IDX(i)].resp;
+
+
+ /* If we are in MODE_INTERCEPT_BE or MODE_COPY_BE: */
+ if ( (blktap_mode & BLKTAP_MODE_INTERCEPT_BE) ||
+ (blktap_mode & BLKTAP_MODE_COPY_BE) ) {
+
+ /* Copy the response message to UBERing */
+ /* In MODE_INTERCEPT_BE, map attached pages into the app vma */
+ /* In MODE_COPY_BE_PAGES, copy attached pages into the app vma */
+
+ /* XXX: copy/map the attached page! */
+
+ DPRINTK("rsp->UBERing\n");
+ blktap_write_be_ring(resp_s);
+
+ }
+
+ /* If we are NOT in MODE_INTERCEPT_BE or MODE_INTERCEPT_FE: */
+ if ( !((blktap_mode & BLKTAP_MODE_INTERCEPT_BE) ||
+ (blktap_mode & BLKTAP_MODE_INTERCEPT_FE)) ) {
+
+ /* (fe included to prevent random interference from the BE) */
+ /* Copy the response message to FERing */
+
+ DPRINTK("blktap: BERing[%u] -> FERing[%u]\n",
+ (unsigned) MASK_BLKIF_IDX(i),
+ (unsigned) MASK_BLKIF_IDX(ptfe_blkif.blk_resp_prod));
+
+ /* remap id, and free the active req. blkif lookup goes here too.*/
+ ar = &active_reqs[resp_s->id];
+ DPRINTK("%3lu > %3lu\n", resp_s->id, ar->id);
+ resp_s->id = ar->id;
+ free_active_req(ar);
+
+ resp_d = &ptfe_blkif.blk_ring_base->ring[
+ MASK_BLKIF_IDX(ptfe_blkif.blk_resp_prod)].resp;
+
+ memcpy(resp_d, resp_s, sizeof(blkif_response_t));
+
+ ptfe_blkif.blk_resp_prod++;
+
+ }
+ }
+
+ ptbe_resp_cons = i;
+
+ /* If we have forwarded any responses, notify the appropriate domains. */
+ if (notify) {
+
+ /* we have sent stuff to the fe. notify it. */
+ if ( !((blktap_mode & BLKTAP_MODE_INTERCEPT_BE) ||
+ (blktap_mode & BLKTAP_MODE_INTERCEPT_FE)) ) {
+ /* Responses must be visible before the producer index moves. */
+ wmb();
+ ptfe_blkif.blk_ring_base->resp_prod = ptfe_blkif.blk_resp_prod;
+
+ notify_via_evtchn(ptfe_blkif.evtchn);
+ DPRINTK(" -- and notified.\n");
+ }
+
+ /* we sent stuff to the app, notify it. */
+ if ( (blktap_mode & BLKTAP_MODE_INTERCEPT_BE) ||
+ (blktap_mode & BLKTAP_MODE_COPY_BE) ) {
+
+ blktap_kick_user();
+ }
+ }
+
+ spin_unlock_irqrestore(&blkif_io_lock, flags);
+ return IRQ_HANDLED;
+}
+
+/*
+ * Forward one response to the real frontend.
+ *
+ * rsp->id is expected to be the index of the active-request record that
+ * was assigned when the request came in.  The original frontend id is put
+ * back into the response, the record is released, and the response is
+ * copied into the next slot of the frontend ring.  Only the private
+ * producer index is advanced; kick_fe_domain() publishes it.
+ *
+ * The response may come from the ring shared with user space, so the id
+ * is range-checked before it is used as an index.
+ *
+ * Returns 0 on success, or -EINVAL if the id is out of range (the response
+ * is then dropped and no state is modified).
+ *
+ * NOTE(review): a response repeating an id that has already been completed
+ * is not detected here.
+ */
+inline int write_resp_to_fe_ring(blkif_response_t *rsp)
+{
+    blkif_response_t *resp_d;
+    active_req_t *ar;
+    unsigned long idx = rsp->id;   /* read once; check and use this copy */
+
+    if ( idx >= MAX_ACTIVE_REQS )
+    {
+        WPRINTK("dropping response with invalid id %lu\n", idx);
+        return -EINVAL;
+    }
+
+    /* remap id, and free the active req. blkif lookup goes here too.*/
+    ar = &active_reqs[idx];
+    DPRINTK("%3lu > %3lu\n", idx, ar->id);
+    rsp->id = ar->id;
+    free_active_req(ar);
+
+    resp_d = &ptfe_blkif.blk_ring_base->ring[
+        MASK_BLKIF_IDX(ptfe_blkif.blk_resp_prod)].resp;
+
+    memcpy(resp_d, rsp, sizeof(blkif_response_t));
+    ptfe_blkif.blk_resp_prod++;
+
+    return 0;
+}
+
+/*
+ * Publish the private response producer index on the frontend ring and
+ * notify the frontend through its event channel.
+ */
+inline void kick_fe_domain(void)
+{
+    /* Queued responses must be visible before the producer index moves. */
+    wmb();
+    ptfe_blkif.blk_ring_base->resp_prod = ptfe_blkif.blk_resp_prod;
+
+    notify_via_evtchn(ptfe_blkif.evtchn);
+}
+
+/*
+ * Publish the private request producer index on the backend ring and
+ * notify the backend.  Performs the same steps as kick_be_domain().
+ */
+static inline void flush_requests(void)
+{
+    /* Ensure that the other end can see the requests. */
+    wmb();
+    blk_ptbe_ring->req_prod = ptbe_req_prod;
+
+    notify_via_evtchn(blkif_ptbe_evtchn);
+}
+
+/*-----[ Data to/from user space ]----------------------------------------*/
+
+
+/*
+ * Pass a request from the real frontend's blkif ring up to the character
+ * device (the user-space frontend ring).
+ *
+ * The request is copied into the next slot of fe_ring and each of its
+ * segments is mapped into the application's VMA at
+ * MMAP_VADDR(req->id, segment).  The private producer index is advanced
+ * only if every segment was mapped.
+ *
+ * Nothing is done if the user ring is not ready or is full.  Always
+ * returns 0, including on those paths and on a mapping failure.
+ */
+int blktap_write_fe_ring(blkif_request_t *req)
+{
+    blkif_request_t *target;
+    int error, i;
+
+    if ( ! blktap_ring_ok ) {
+        DPRINTK("blktap: fe_ring not ready for a request!\n");
+        return 0;
+    }
+
+    if ( BLKTAP_RING_FULL(RING(&fe_ring)) ) {
+        DPRINTK("blktap: fe_ring is full, can't add.\n");
+        return 0;
+    }
+
+    target = &fe_ring.ring->ring[MASK_BLKIF_IDX(fe_ring.req_prod)].req;
+    memcpy(target, req, sizeof(*req));
+
+/* maybe move this stuff out into a separate func ------------------- */
+
+    /*
+     * For now, map attached page into a fixed position into the vma.
+     * XXX: make this map to a free page.
+     */
+
+    /* Attempt to map the foreign pages directly in to the application */
+    for (i=0; i<target->nr_segments; i++) {
+
+        /* get an unused virtual address from the char device */
+        /* store the old page address */
+        /* replace the address with the virtual address */
+
+        /* blktap_vma->vm_start+((2+i)*PAGE_SIZE) */
+
+        /* Map segment i's own frame (not always the first segment's). */
+        error = direct_remap_area_pages(blktap_vma->vm_mm,
+                                        MMAP_VADDR(req->id, i),
+                                        target->frame_and_sects[i] & PAGE_MASK,
+                                        PAGE_SIZE,
+                                        blktap_vma->vm_page_prot,
+                                        ptfe_blkif.domid);
+        if ( error != 0 ) {
+            printk(KERN_INFO "remapping attached page failed! (%d)\n", error);
+            return 0;
+        }
+    }
+    /* fix the address of the attached page in the message. */
+    /* TODO: preserve the segment number stuff here... */
+    /* target->frame_and_sects[0] = blktap_vma->vm_start + PAGE_SIZE;*/
+/* ------------------------------------------------------------------ */
+
+
+    fe_ring.req_prod++;
+
+    return 0;
+}
+
+/*
+ * Pass a response from the real backend's blkif ring up to the character
+ * device (the user-space backend ring).
+ *
+ * The response is copied into the next slot of be_ring and the private
+ * producer index is advanced.  Nothing is done if the user ring is not
+ * ready or is reported full.  Always returns 0.
+ */
+int blktap_write_be_ring(blkif_response_t *rsp)
+{
+    blkif_response_t *slot;
+
+    if ( !blktap_ring_ok )
+    {
+        DPRINTK("blktap: be_ring not ready for a request!\n");
+        return 0;
+    }
+
+    if ( BLKTAP_RING_FULL(RING(&be_ring)) )
+    {
+        DPRINTK("blktap: be_ring is full, can't add.\n");
+        return 0;
+    }
+
+    slot = &be_ring.ring->ring[MASK_BLKIF_IDX(be_ring.rsp_prod)].resp;
+    memcpy(slot, rsp, sizeof(*rsp));
+
+    /* XXX: map attached pages and fix-up addresses in the copied address. */
+
+    be_ring.rsp_prod++;
+
+    return 0;
+}
+
+/*
+ * Read responses that user space has placed on the user frontend ring.
+ *
+ * Only acts in INTERCEPT_FE mode: every outstanding response is forwarded
+ * to the real frontend ring, the response consumer index is advanced, and
+ * the frontend is kicked if there was anything to forward.  In any other
+ * mode the ring is left untouched.  Always returns 0.
+ */
+int blktap_read_fe_ring(void)
+{
+    BLKIF_RING_IDX prod;
+    unsigned long idx;
+    int pending;
+
+    DPRINTK("blktap_read_fe_ring()\n");
+
+    prod = fe_ring.ring->resp_prod;
+    /* Read the producer index before reading the entries it covers. */
+    rmb();
+    pending = (prod != fe_ring.rsp_cons);
+
+    /* Nothing to forward unless the frontend path is intercepted. */
+    if ( !(blktap_mode & BLKTAP_MODE_INTERCEPT_FE) )
+        return 0;
+
+    /* XXX: remap pages on each message as necessary */
+    idx = fe_ring.rsp_cons;
+    while ( idx != prod )
+    {
+        DPRINTK("resp->fe_ring\n");
+        write_resp_to_fe_ring(&fe_ring.ring->ring[MASK_BLKIF_IDX(idx)].resp);
+        idx++;
+    }
+
+    fe_ring.rsp_cons = prod;
+
+    if ( pending )
+    {
+        DPRINTK("kick_fe_domain()\n");
+        kick_fe_domain();
+    }
+
+    return 0;
+}
+
+/*
+ * Read requests that user space has placed on the user backend ring.
+ *
+ * Only acts in INTERCEPT_BE mode: every outstanding request is forwarded
+ * to the real backend ring, the request consumer index is advanced, and
+ * the backend is kicked if there was anything to forward.  In any other
+ * mode the ring is left untouched.  Always returns 0.
+ */
+int blktap_read_be_ring(void)
+{
+    BLKIF_RING_IDX prod;
+    unsigned long idx;
+    int pending;
+
+    DPRINTK("blktap_read_be_ring()\n");
+
+    prod = be_ring.ring->req_prod;
+    /* Read the producer index before reading the entries it covers. */
+    rmb();
+    pending = (prod != be_ring.req_cons);
+
+    /* Nothing to forward unless the backend path is intercepted. */
+    if ( !(blktap_mode & BLKTAP_MODE_INTERCEPT_BE) )
+        return 0;
+
+    /* XXX: remap pages on each message as necessary */
+    idx = be_ring.req_cons;
+    while ( idx != prod )
+    {
+        DPRINTK("req->be_ring\n");
+        write_req_to_be_ring(&be_ring.ring->ring[MASK_BLKIF_IDX(idx)].req);
+        idx++;
+    }
+
+    be_ring.req_cons = prod;
+
+    if ( pending )
+    {
+        DPRINTK("kick_be_domain()\n");
+        kick_be_domain();
+    }
+
+    return 0;
+}
--- /dev/null
+/******************************************************************************
+ * blktap_userdev.c
+ *
+ * XenLinux virtual block-device tap.
+ * Control interface between the driver and a character device.
+ *
+ * Copyright (c) 2004, Andrew Warfield
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/miscdevice.h>
+#include <linux/errno.h>
+#include <linux/major.h>
+#include <linux/gfp.h>
+#include <linux/poll.h>
+#include <asm/pgalloc.h>
+
+#include "blktap.h"
+
+
+unsigned long blktap_mode = BLKTAP_MODE_PASSTHROUGH;
+
+/* Only one process may open /dev/xen/blktap at any time. */
+static unsigned long blktap_dev_inuse;
+unsigned long blktap_ring_ok; /* make this ring->state */
+
+/* for poll: */
+static wait_queue_head_t blktap_wait;
+
+/* Where things are inside the device mapping. */
+struct vm_area_struct *blktap_vma;
+unsigned long mmap_vstart;
+unsigned long rings_vstart;
+
+/* -------[ blktap vm ops ]------------------------------------------- */
+
+static struct page *blktap_nopage(struct vm_area_struct *vma,
+                                  unsigned long address,
+                                  int *type)
+{
+    /*
+     * Nothing in this mapping is ever faulted in on demand.  If the page
+     * has not been mapped in by the driver, raise SIGBUS on the faulting
+     * (current) process and report the fault as a bus error.
+     */
+
+    force_sig(SIGBUS, current);
+
+    /* Use the named constant rather than a bare 0 for the page pointer. */
+    return NOPAGE_SIGBUS;
+}
+
+/* VM operations installed on the user mapping by blktap_mmap(). */
+struct vm_operations_struct blktap_vm_ops = {
+    .nopage = blktap_nopage,
+};
+
+/* -------[ blktap file ops ]----------------------------------------- */
+
+/*
+ * Open the tap device.  Only one opener is allowed at a time; the fe and be
+ * ring pages are allocated here and freed again in blktap_release().
+ *
+ * Returns 0 on success, -EBUSY if the device is already open, or -ENOMEM if
+ * a ring page cannot be allocated.  On failure every resource taken so far
+ * is released, including the in-use flag, so that a later open can succeed.
+ */
+static int blktap_open(struct inode *inode, struct file *filp)
+{
+    if ( test_and_set_bit(0, &blktap_dev_inuse) )
+        return -EBUSY;
+
+    printk(KERN_ALERT "blktap open.\n");
+
+    /* Allocate the fe ring. */
+    fe_ring.ring = (blkif_ring_t *)get_zeroed_page(GFP_KERNEL);
+    if (fe_ring.ring == NULL)
+        goto fail_nomem;
+
+    SetPageReserved(virt_to_page(fe_ring.ring));
+
+    fe_ring.ring->req_prod = fe_ring.ring->resp_prod
+                           = fe_ring.req_prod
+                           = fe_ring.rsp_cons
+                           = 0;
+
+    /* Allocate the be ring. */
+    be_ring.ring = (blkif_ring_t *)get_zeroed_page(GFP_KERNEL);
+    if (be_ring.ring == NULL)
+        goto fail_free_fe;
+
+    SetPageReserved(virt_to_page(be_ring.ring));
+
+    be_ring.ring->req_prod = be_ring.ring->resp_prod
+                           = be_ring.rsp_prod
+                           = be_ring.req_cons
+                           = 0;
+
+    DPRINTK(KERN_ALERT "blktap open.\n");
+
+    return 0;
+
+ fail_free_fe:
+    /* The page was marked reserved above; undo that before freeing it. */
+    ClearPageReserved(virt_to_page(fe_ring.ring));
+    free_page( (unsigned long) fe_ring.ring);
+    fe_ring.ring = NULL;
+
+ fail_nomem:
+    /* Drop the single-opener claim taken at the top of the function. */
+    clear_bit(0, &blktap_dev_inuse);
+    return -ENOMEM;
+}
+
+/*
+ * Last close of the tap device: clear blktap_ring_ok, free both ring pages
+ * allocated by blktap_open(), and only then allow the device to be opened
+ * again.  Always returns 0.
+ */
+static int blktap_release(struct inode *inode, struct file *filp)
+{
+    /* Clear the flag that blktap_mmap() set, before the pages go away. */
+    blktap_ring_ok = 0;
+
+    printk(KERN_ALERT "blktap closed.\n");
+
+    /* Free the ring pages. */
+    ClearPageReserved(virt_to_page(fe_ring.ring));
+    free_page((unsigned long) fe_ring.ring);
+
+    ClearPageReserved(virt_to_page(be_ring.ring));
+    free_page((unsigned long) be_ring.ring);
+
+    /*
+     * Release the single-opener claim last.  Clearing it before the frees
+     * would let a concurrent blktap_open() install fresh ring pages that
+     * the code above would then free.
+     */
+    smp_mb__before_clear_bit();
+    clear_bit(0, &blktap_dev_inuse);
+
+    return 0;
+}
+
+/*
+ * Map the device into user space.  The mapping must be exactly
+ * MMAP_PAGES + RING_PAGES pages long.  The be ring page is mapped at the
+ * start of the region and the fe ring page directly after it; the rest of
+ * the region is left unpopulated here (faults on it go to blktap_nopage()).
+ *
+ * Returns 0 on success, or -EAGAIN if the size is wrong or a ring page
+ * cannot be mapped.  blktap_ring_ok is only set once both ring pages are
+ * in place.
+ */
+static int blktap_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+    int size;
+
+    printk(KERN_ALERT "blktap mmap (%lx, %lx)\n",
+           vma->vm_start, vma->vm_end);
+
+    vma->vm_ops = &blktap_vm_ops;
+
+    size = vma->vm_end - vma->vm_start;
+    if ( size != ( (MMAP_PAGES + RING_PAGES) << PAGE_SHIFT ) ) {
+        printk(KERN_INFO
+               "blktap: you _must_ map exactly %d pages!\n",
+               MMAP_PAGES + RING_PAGES);
+        return -EAGAIN;
+    }
+
+    size >>= PAGE_SHIFT;
+    /* Report the non-ring pages only; the ring pages are counted apart. */
+    printk(KERN_INFO "blktap: 2 rings + %d pages.\n", size - RING_PAGES);
+
+    /* Map the ring pages to the start of the region and reserve it. */
+
+    /* not sure if I really need to do this... */
+    vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+
+    DPRINTK("Mapping be_ring page %lx.\n", __pa(be_ring.ring));
+    if (remap_page_range(vma, vma->vm_start, __pa(be_ring.ring), PAGE_SIZE,
+                         vma->vm_page_prot)) {
+        printk(KERN_ERR "be_ring: remap_page_range failure!\n");
+        return -EAGAIN;
+    }
+
+    DPRINTK("Mapping fe_ring page %lx.\n", __pa(fe_ring.ring));
+    if (remap_page_range(vma, vma->vm_start + PAGE_SIZE, __pa(fe_ring.ring),
+                         PAGE_SIZE, vma->vm_page_prot)) {
+        printk(KERN_ERR "fe_ring: remap_page_range failure!\n");
+        return -EAGAIN;
+    }
+
+    /* Publish the layout only after both ring pages are mapped. */
+    rings_vstart = vma->vm_start;
+    mmap_vstart  = rings_vstart + (RING_PAGES << PAGE_SHIFT);
+
+    blktap_vma = vma;
+    blktap_ring_ok = 1;
+
+    return 0;
+}
+
+/*
+ * Control interface for the tap device.
+ *
+ *   BLKTAP_IOCTL_KICK_FE  - process the fe ring; returns the result of
+ *                           blktap_read_fe_ring().
+ *   BLKTAP_IOCTL_KICK_BE  - process the be ring; returns the result of
+ *                           blktap_read_be_ring().
+ *   BLKTAP_IOCTL_SETMODE  - set blktap_mode to arg; returns 0, or -EINVAL
+ *                           if arg fails BLKTAP_MODE_VALID().
+ *
+ * Any other command returns -ENOIOCTLCMD.
+ */
+static int blktap_ioctl(struct inode *inode, struct file *filp,
+                        unsigned int cmd, unsigned long arg)
+{
+    switch(cmd) {
+    case BLKTAP_IOCTL_KICK_FE: /* There are fe messages to process. */
+        return blktap_read_fe_ring();
+
+    case BLKTAP_IOCTL_KICK_BE: /* There are be messages to process. */
+        return blktap_read_be_ring();
+
+    case BLKTAP_IOCTL_SETMODE:
+        /* A recognised command with a bad argument is an invalid value,
+         * not an unknown ioctl. */
+        if (!BLKTAP_MODE_VALID(arg))
+            return -EINVAL;
+
+        blktap_mode = arg;
+        /* XXX: may need to flush rings here. */
+        printk(KERN_INFO "blktap: set mode to %lx\n", arg);
+        return 0;
+
+    default:
+        break;
+    }
+    return -ENOIOCTLCMD;
+}
+
+/*
+ * poll() handler.  Registers the caller on blktap_wait, then compares the
+ * driver's private producer indices with the ones stored in the ring pages.
+ * If either differs, both private indices are copied into the ring pages
+ * and the device is reported readable; otherwise 0 is returned.
+ */
+static unsigned int blktap_poll(struct file *file, poll_table *wait)
+{
+    poll_wait(file, &blktap_wait, wait);
+
+    /* Nothing new on either ring: not readable yet. */
+    if ( (fe_ring.req_prod == fe_ring.ring->req_prod) &&
+         (be_ring.rsp_prod == be_ring.ring->resp_prod) )
+        return 0;
+
+    /* Copy the private producer indices into the ring pages. */
+    fe_ring.ring->req_prod  = fe_ring.req_prod;
+    be_ring.ring->resp_prod = be_ring.rsp_prod;
+
+    return POLLIN | POLLRDNORM;
+}
+
+/*
+ * Wake any task sleeping on blktap_wait (the queue blktap_poll() registers
+ * on).  The ring indices themselves are not touched here; the assignment
+ * below is left commented out.
+ */
+void blktap_kick_user(void)
+{
+ /* blktap_ring->req_prod = blktap_req_prod; */
+ wake_up_interruptible(&blktap_wait);
+}
+
+/* File operations for the blktap character device. */
+static struct file_operations blktap_fops = {
+    .owner   = THIS_MODULE,
+    .poll    = blktap_poll,
+    .ioctl   = blktap_ioctl,
+    .open    = blktap_open,
+    .release = blktap_release,
+    .mmap    = blktap_mmap,
+};
+
+/* -------[ blktap module setup ]------------------------------------- */
+
+/*
+ * Misc-device description handed to misc_register() in blktap_init():
+ * minor BLKTAP_MINOR, name "blktap", devfs path "misc/blktap", served by
+ * blktap_fops.
+ */
+static struct miscdevice blktap_miscdev = {
+ .minor = BLKTAP_MINOR,
+ .name = "blktap",
+ .fops = &blktap_fops,
+ .devfs_name = "misc/blktap",
+};
+
+/*
+ * Set up the user-space side of the tap: initialise the poll wait queue and
+ * register the misc device.  Returns 0 on success or the error returned by
+ * misc_register().
+ */
+int blktap_init(void)
+{
+    int err;
+
+    /*
+     * The wait queue must be usable before the device becomes visible:
+     * once misc_register() returns, the device can be opened and
+     * blktap_poll() may run.
+     */
+    init_waitqueue_head(&blktap_wait);
+
+    err = misc_register(&blktap_miscdev);
+    if ( err != 0 )
+    {
+        printk(KERN_ALERT "Couldn't register /dev/misc/blktap (%d)\n", err);
+        return err;
+    }
+
+    return 0;
+}
--- /dev/null
+/* two abstractions specific to kernel/smpboot.c, mainly to cater to visws
+ * which needs to alter them. */
+
+/*
+ * With the "#if 1" branch active this hook only logs that it was called;
+ * the assignment io_apic_irqs = 0 in the #else branch is compiled out.
+ */
+static inline void smpboot_clear_io_apic_irqs(void)
+{
+#if 1
+ printk("smpboot_clear_io_apic_irqs\n");
+#else
+ io_apic_irqs = 0;
+#endif
+}
+
+/*
+ * With the "#if 1" branch active this hook only logs that it was called and
+ * ignores start_eip.  The compiled-out #else branch writes CMOS register
+ * 0xf and stores start_eip >> 4 and start_eip & 0xf at TRAMPOLINE_HIGH and
+ * TRAMPOLINE_LOW.
+ */
+static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip)
+{
+#if 1
+ printk("smpboot_setup_warm_reset_vector\n");
+#else
+ CMOS_WRITE(0xa, 0xf);
+ local_flush_tlb();
+ Dprintk("1.\n");
+ *((volatile unsigned short *) TRAMPOLINE_HIGH) = start_eip >> 4;
+ Dprintk("2.\n");
+ *((volatile unsigned short *) TRAMPOLINE_LOW) = start_eip & 0xf;
+ Dprintk("3.\n");
+#endif
+}
+
+/*
+ * Flush the local TLB, write 0 to CMOS register 0xf and clear the long at
+ * physical address 0x467.
+ *
+ * NOTE(review): unlike the other hooks in this file, this one is not
+ * replaced by a printk stub -- confirm that touching CMOS and low memory
+ * here is intended.
+ */
+static inline void smpboot_restore_warm_reset_vector(void)
+{
+ /*
+ * Install writable page 0 entry to set BIOS data area.
+ */
+ local_flush_tlb();
+
+ /*
+ * Paranoid: Set warm reset code and vector here back
+ * to default values.
+ */
+ CMOS_WRITE(0, 0xf);
+
+ *((volatile long *) phys_to_virt(0x467)) = 0;
+}
+
+/*
+ * With the "#if 1" branch active this hook only logs that it was called;
+ * the setup_IO_APIC() call in the #else branch is compiled out.
+ */
+static inline void smpboot_setup_io_apic(void)
+{
+#if 1
+ printk("smpboot_setup_io_apic\n");
+#else
+ /*
+ * Here we can be sure that there is an IO-APIC in the system. Let's
+ * go and set it up:
+ */
+ if (!skip_ioapic_setup && nr_ioapics)
+ setup_IO_APIC();
+#endif
+}
+
+
+#define smp_found_config (HYPERVISOR_shared_info->n_vcpu > 1)
--- /dev/null
+#ifndef __ASM_SPINLOCK_H
+#define __ASM_SPINLOCK_H
+
+#include <asm/atomic.h>
+#include <asm/rwlock.h>
+#include <asm/page.h>
+#include <linux/config.h>
+#include <linux/compiler.h>
+
+asmlinkage int printk(const char * fmt, ...)
+ __attribute__ ((format (printf, 1, 2)));
+
+/*
+ * Your basic SMP spinlocks, allowing only a single CPU anywhere
+ */
+
+/*
+ * Spinlock state.  SPIN_LOCK_UNLOCKED initialises lock to 1, and
+ * spin_is_locked() treats a low byte <= 0 as "held".
+ */
+typedef struct {
+ volatile unsigned int lock;
+#ifdef CONFIG_DEBUG_SPINLOCK
+ unsigned magic; /* set to SPINLOCK_MAGIC by the initialiser */
+#endif
+} spinlock_t;
+
+#define SPINLOCK_MAGIC 0xdead4ead
+
+#ifdef CONFIG_DEBUG_SPINLOCK
+#define SPINLOCK_MAGIC_INIT , SPINLOCK_MAGIC
+#else
+#define SPINLOCK_MAGIC_INIT /* */
+#endif
+
+#define SPIN_LOCK_UNLOCKED (spinlock_t) { 1 SPINLOCK_MAGIC_INIT }
+
+#define spin_lock_init(x) do { *(x) = SPIN_LOCK_UNLOCKED; } while(0)
+
+/*
+ * Simple spin lock operations. There are two variants, one clears IRQ's
+ * on the local processor, one does not.
+ *
+ * We make no fairness assumptions. They have a cost.
+ */
+
+#define spin_is_locked(x) (*(volatile signed char *)(&(x)->lock) <= 0)
+#define spin_unlock_wait(x) do { barrier(); } while(spin_is_locked(x))
+
+#define spin_lock_string \
+ "\n1:\t" \
+ "lock ; decb %0\n\t" \
+ "jns 3f\n" \
+ "2:\t" \
+ "rep;nop\n\t" \
+ "cmpb $0,%0\n\t" \
+ "jle 2b\n\t" \
+ "jmp 1b\n" \
+ "3:\n\t"
+
+#define spin_lock_string_flags \
+ "\n1:\t" \
+ "lock ; decb %0\n\t" \
+ "jns 4f\n\t" \
+ "2:\t" \
+ "testl $0x200, %1\n\t" \
+ "jz 3f\n\t" \
+ "#sti\n\t" \
+ "3:\t" \
+ "rep;nop\n\t" \
+ "cmpb $0, %0\n\t" \
+ "jle 3b\n\t" \
+ "#cli\n\t" \
+ "jmp 1b\n" \
+ "4:\n\t"
+
+/*
+ * This works. Despite all the confusion.
+ * (except on PPro SMP or if we are using OOSTORE)
+ * (PPro errata 66, 92)
+ */
+
+#if !defined(CONFIG_X86_OOSTORE) && !defined(CONFIG_X86_PPRO_FENCE)
+
+#define spin_unlock_string \
+ "movb $1,%0" \
+ :"=m" (lock->lock) : : "memory"
+
+
+/*
+ * Release the lock by storing 1 into lock->lock with a plain byte move
+ * (see spin_unlock_string).  This variant is built when neither
+ * CONFIG_X86_OOSTORE nor CONFIG_X86_PPRO_FENCE is defined.  With
+ * CONFIG_DEBUG_SPINLOCK it first checks the magic and that the lock is
+ * actually held.
+ */
+static inline void _raw_spin_unlock(spinlock_t *lock)
+{
+#ifdef CONFIG_DEBUG_SPINLOCK
+ BUG_ON(lock->magic != SPINLOCK_MAGIC);
+ BUG_ON(!spin_is_locked(lock));
+#endif
+ __asm__ __volatile__(
+ spin_unlock_string
+ );
+}
+
+#else
+
+#define spin_unlock_string \
+ "xchgb %b0, %1" \
+ :"=q" (oldval), "=m" (lock->lock) \
+ :"0" (oldval) : "memory"
+
+/*
+ * Release the lock by exchanging 1 into lock->lock with xchgb (see
+ * spin_unlock_string).  This variant is built when CONFIG_X86_OOSTORE or
+ * CONFIG_X86_PPRO_FENCE is defined.  With CONFIG_DEBUG_SPINLOCK it first
+ * checks the magic and that the lock is actually held.
+ */
+static inline void _raw_spin_unlock(spinlock_t *lock)
+{
+ char oldval = 1; /* value swapped into the lock byte */
+#ifdef CONFIG_DEBUG_SPINLOCK
+ BUG_ON(lock->magic != SPINLOCK_MAGIC);
+ BUG_ON(!spin_is_locked(lock));
+#endif
+ __asm__ __volatile__(
+ spin_unlock_string
+ );
+}
+
+#endif
+
+/*
+ * Try to take the lock without spinning: exchange 0 into the lock byte and
+ * look at what was there before.  Returns non-zero if the previous value
+ * was > 0 (the lock was free and is now held by the caller), 0 otherwise.
+ */
+static inline int _raw_spin_trylock(spinlock_t *lock)
+{
+ char oldval;
+ __asm__ __volatile__(
+ "xchgb %b0,%1"
+ :"=q" (oldval), "=m" (lock->lock)
+ :"0" (0) : "memory");
+ return oldval > 0;
+}
+
+/*
+ * Acquire the lock, spinning until it is available.  The asm in
+ * spin_lock_string does a locked decrement of the lock byte; if the result
+ * is negative it spins (rep;nop) until the byte is > 0 again and retries.
+ * With CONFIG_DEBUG_SPINLOCK a bad magic prints the caller's return address
+ * and calls BUG().
+ */
+static inline void _raw_spin_lock(spinlock_t *lock)
+{
+#ifdef CONFIG_DEBUG_SPINLOCK
+ if (unlikely(lock->magic != SPINLOCK_MAGIC)) {
+ printk("eip: %p\n", __builtin_return_address(0));
+ BUG();
+ }
+#endif
+ __asm__ __volatile__(
+ spin_lock_string
+ :"=m" (lock->lock) : : "memory");
+}
+
+/*
+ * Acquire the lock like _raw_spin_lock(), using spin_lock_string_flags.
+ * The asm tests bit 0x200 of @flags while waiting.
+ *
+ * NOTE(review): in spin_lock_string_flags the sti/cli instructions are
+ * written as "#sti" / "#cli", which looks like they are commented out for
+ * the assembler; if so, the test on @flags has no effect -- confirm.
+ */
+static inline void _raw_spin_lock_flags (spinlock_t *lock, unsigned long flags)
+{
+#ifdef CONFIG_DEBUG_SPINLOCK
+ if (unlikely(lock->magic != SPINLOCK_MAGIC)) {
+ printk("eip: %p\n", __builtin_return_address(0));
+ BUG();
+ }
+#endif
+ __asm__ __volatile__(
+ spin_lock_string_flags
+ :"=m" (lock->lock) : "r" (flags) : "memory");
+}
+
+/*
+ * Read-write spinlocks, allowing multiple readers
+ * but only one writer.
+ *
+ * NOTE! it is quite common to have readers in interrupts
+ * but no interrupt writers. For those circumstances we
+ * can "mix" irq-safe locks - any writer needs to get a
+ * irq-safe write-lock, but readers can get non-irqsafe
+ * read-locks.
+ */
+/*
+ * Read-write lock state.  RW_LOCK_UNLOCKED initialises lock to
+ * RW_LOCK_BIAS, and rwlock_is_locked() reports any other value as locked.
+ */
+typedef struct {
+ volatile unsigned int lock;
+#ifdef CONFIG_DEBUG_SPINLOCK
+ unsigned magic; /* set to RWLOCK_MAGIC by the initialiser */
+#endif
+} rwlock_t;
+
+#define RWLOCK_MAGIC 0xdeaf1eed
+
+#ifdef CONFIG_DEBUG_SPINLOCK
+#define RWLOCK_MAGIC_INIT , RWLOCK_MAGIC
+#else
+#define RWLOCK_MAGIC_INIT /* */
+#endif
+
+#define RW_LOCK_UNLOCKED (rwlock_t) { RW_LOCK_BIAS RWLOCK_MAGIC_INIT }
+
+#define rwlock_init(x) do { *(x) = RW_LOCK_UNLOCKED; } while(0)
+
+#define rwlock_is_locked(x) ((x)->lock != RW_LOCK_BIAS)
+
+/*
+ * On x86, we implement read-write locks as a 32-bit counter
+ * with the high bit (sign) being the "contended" bit.
+ *
+ * The inline assembly is non-obvious. Think about it.
+ *
+ * Changed to use the same technique as rw semaphores. See
+ * semaphore.h for details. -ben
+ */
+/* the spinlock helpers are in arch/i386/kernel/semaphore.c */
+
+/*
+ * Take the lock for reading via __build_read_lock(), naming
+ * "__read_lock_failed" as the helper for the contended path (presumably
+ * provided by <asm/rwlock.h> and semaphore.c -- not visible here).  With
+ * CONFIG_DEBUG_SPINLOCK the magic is checked first.
+ */
+static inline void _raw_read_lock(rwlock_t *rw)
+{
+#ifdef CONFIG_DEBUG_SPINLOCK
+ BUG_ON(rw->magic != RWLOCK_MAGIC);
+#endif
+ __build_read_lock(rw, "__read_lock_failed");
+}
+
+/*
+ * Take the lock for writing via __build_write_lock(), naming
+ * "__write_lock_failed" as the helper for the contended path (presumably
+ * provided by <asm/rwlock.h> and semaphore.c -- not visible here).  With
+ * CONFIG_DEBUG_SPINLOCK the magic is checked first.
+ */
+static inline void _raw_write_lock(rwlock_t *rw)
+{
+#ifdef CONFIG_DEBUG_SPINLOCK
+ BUG_ON(rw->magic != RWLOCK_MAGIC);
+#endif
+ __build_write_lock(rw, "__write_lock_failed");
+}
+
+#define _raw_read_unlock(rw) asm volatile("lock ; incl %0" :"=m" ((rw)->lock) : : "memory")
+#define _raw_write_unlock(rw) asm volatile("lock ; addl $" RW_LOCK_BIAS_STR ",%0":"=m" ((rw)->lock) : : "memory")
+
+/*
+ * Try to take the lock for writing without spinning.  The whole
+ * RW_LOCK_BIAS is subtracted from the counter; if that brings it to zero
+ * the caller holds the write lock and 1 is returned.  Otherwise the bias is
+ * added back and 0 is returned.
+ */
+static inline int _raw_write_trylock(rwlock_t *lock)
+{
+    atomic_t *counter = (atomic_t *)lock;
+
+    if (!atomic_sub_and_test(RW_LOCK_BIAS, counter)) {
+        /* Somebody else holds the lock: undo the subtraction. */
+        atomic_add(RW_LOCK_BIAS, counter);
+        return 0;
+    }
+
+    return 1;
+}
+
+#endif /* __ASM_SPINLOCK_H */
+++ /dev/null
-/*
- * Intel SMP support routines.
- *
- * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
- * (c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com>
- *
- * This code is released under the GNU General Public License version 2 or
- * later.
- */
-
-#include <linux/init.h>
-
-#include <linux/mm.h>
-#include <linux/irq.h>
-#include <linux/delay.h>
-#include <linux/spinlock.h>
-#include <linux/smp_lock.h>
-#include <linux/kernel_stat.h>
-#include <linux/mc146818rtc.h>
-#include <linux/cache.h>
-#include <linux/interrupt.h>
-
-#include <asm/mtrr.h>
-#include <asm/tlbflush.h>
-#if 0
-#include <mach_apic.h>
-#endif
-#include <asm-xen/evtchn.h>
-
-#define xxprint(msg) HYPERVISOR_console_io(CONSOLEIO_write, strlen(msg), msg)
-
-/*
- * Some notes on x86 processor bugs affecting SMP operation:
- *
- * Pentium, Pentium Pro, II, III (and all CPUs) have bugs.
- * The Linux implications for SMP are handled as follows:
- *
- * Pentium III / [Xeon]
- * None of the E1AP-E3AP errata are visible to the user.
- *
- * E1AP. see PII A1AP
- * E2AP. see PII A2AP
- * E3AP. see PII A3AP
- *
- * Pentium II / [Xeon]
- * None of the A1AP-A3AP errata are visible to the user.
- *
- * A1AP. see PPro 1AP
- * A2AP. see PPro 2AP
- * A3AP. see PPro 7AP
- *
- * Pentium Pro
- * None of 1AP-9AP errata are visible to the normal user,
- * except occasional delivery of 'spurious interrupt' as trap #15.
- * This is very rare and a non-problem.
- *
- * 1AP. Linux maps APIC as non-cacheable
- * 2AP. worked around in hardware
- * 3AP. fixed in C0 and above steppings microcode update.
- * Linux does not use excessive STARTUP_IPIs.
- * 4AP. worked around in hardware
- * 5AP. symmetric IO mode (normal Linux operation) not affected.
- * 'noapic' mode has vector 0xf filled out properly.
- * 6AP. 'noapic' mode might be affected - fixed in later steppings
- * 7AP. We do not assume writes to the LVT deassering IRQs
- * 8AP. We do not enable low power mode (deep sleep) during MP bootup
- * 9AP. We do not use mixed mode
- *
- * Pentium
- * There is a marginal case where REP MOVS on 100MHz SMP
- * machines with B stepping processors can fail. XXX should provide
- * an L1cache=Writethrough or L1cache=off option.
- *
- * B stepping CPUs may hang. There are hardware work arounds
- * for this. We warn about it in case your board doesn't have the work
- * arounds. Basically thats so I can tell anyone with a B stepping
- * CPU and SMP problems "tough".
- *
- * Specific items [From Pentium Processor Specification Update]
- *
- * 1AP. Linux doesn't use remote read
- * 2AP. Linux doesn't trust APIC errors
- * 3AP. We work around this
- * 4AP. Linux never generated 3 interrupts of the same priority
- * to cause a lost local interrupt.
- * 5AP. Remote read is never used
- * 6AP. not affected - worked around in hardware
- * 7AP. not affected - worked around in hardware
- * 8AP. worked around in hardware - we get explicit CS errors if not
- * 9AP. only 'noapic' mode affected. Might generate spurious
- * interrupts, we log only the first one and count the
- * rest silently.
- * 10AP. not affected - worked around in hardware
- * 11AP. Linux reads the APIC between writes to avoid this, as per
- * the documentation. Make sure you preserve this as it affects
- * the C stepping chips too.
- * 12AP. not affected - worked around in hardware
- * 13AP. not affected - worked around in hardware
- * 14AP. we always deassert INIT during bootup
- * 15AP. not affected - worked around in hardware
- * 16AP. not affected - worked around in hardware
- * 17AP. not affected - worked around in hardware
- * 18AP. not affected - worked around in hardware
- * 19AP. not affected - worked around in BIOS
- *
- * If this sounds worrying believe me these bugs are either ___RARE___,
- * or are signal timing bugs worked around in hardware and there's
- * about nothing of note with C stepping upwards.
- */
-
-DEFINE_PER_CPU(struct tlb_state, cpu_tlbstate) ____cacheline_aligned = { &init_mm, 0, };
-
-/*
- * the following functions deal with sending IPIs between CPUs.
- *
- * We use 'broadcast', CPU->CPU IPIs and self-IPIs too.
- */
-
-static inline int __prepare_ICR (unsigned int shortcut, int vector)
-{
- return APIC_DM_FIXED | shortcut | vector | APIC_DEST_LOGICAL;
-}
-
-static inline int __prepare_ICR2 (unsigned int mask)
-{
- return SET_APIC_DEST_FIELD(mask);
-}
-
-DECLARE_PER_CPU(int, ipi_to_evtchn[NR_IPIS]);
-
-static inline void __send_IPI_one(unsigned int cpu, int vector)
-{
- unsigned int evtchn;
-
- evtchn = per_cpu(ipi_to_evtchn, cpu)[vector];
- // printk("send_IPI_mask_bitmask cpu %d vector %d evtchn %d\n", cpu, vector, evtchn);
- if (evtchn) {
-#if 0
- shared_info_t *s = HYPERVISOR_shared_info;
- while (synch_test_bit(evtchn, &s->evtchn_pending[0]) ||
- synch_test_bit(evtchn, &s->evtchn_mask[0]))
- ;
-#endif
- notify_via_evtchn(evtchn);
- } else
- printk("send_IPI to unbound port %d/%d",
- cpu, vector);
-}
-
-void __send_IPI_shortcut(unsigned int shortcut, int vector)
-{
- int cpu;
-
- switch (shortcut) {
- case APIC_DEST_SELF:
- __send_IPI_one(smp_processor_id(), vector);
- break;
- case APIC_DEST_ALLBUT:
- for (cpu = 0; cpu < NR_CPUS; ++cpu) {
- if (cpu == smp_processor_id())
- continue;
- if (cpu_isset(cpu, cpu_online_map)) {
- __send_IPI_one(cpu, vector);
- }
- }
- break;
- default:
- printk("XXXXXX __send_IPI_shortcut %08x vector %d\n", shortcut,
- vector);
- break;
- }
-}
-
-void fastcall send_IPI_self(int vector)
-{
- __send_IPI_shortcut(APIC_DEST_SELF, vector);
-}
-
-/*
- * This is only used on smaller machines.
- */
-void send_IPI_mask_bitmask(cpumask_t mask, int vector)
-{
- unsigned long flags;
- unsigned int cpu;
-
- local_irq_save(flags);
-
- for (cpu = 0; cpu < NR_CPUS; ++cpu) {
- if (cpu_isset(cpu, mask)) {
- __send_IPI_one(cpu, vector);
- }
- }
-
- local_irq_restore(flags);
-}
-
-inline void send_IPI_mask_sequence(cpumask_t mask, int vector)
-{
-
- send_IPI_mask_bitmask(mask, vector);
-}
-
-#include <mach_ipi.h> /* must come after the send_IPI functions above for inlining */
-
-/*
- * Smarter SMP flushing macros.
- * c/o Linus Torvalds.
- *
- * These mean you can really definitely utterly forget about
- * writing to user space from interrupts. (Its not allowed anyway).
- *
- * Optimizations Manfred Spraul <manfred@colorfullife.com>
- */
-
-static cpumask_t flush_cpumask;
-static struct mm_struct * flush_mm;
-static unsigned long flush_va;
-static spinlock_t tlbstate_lock = SPIN_LOCK_UNLOCKED;
-#define FLUSH_ALL 0xffffffff
-
-/*
- * We cannot call mmdrop() because we are in interrupt context,
- * instead update mm->cpu_vm_mask.
- *
- * We need to reload %cr3 since the page tables may be going
- * away from under us..
- */
-static inline void leave_mm (unsigned long cpu)
-{
- if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK)
- BUG();
- cpu_clear(cpu, per_cpu(cpu_tlbstate, cpu).active_mm->cpu_vm_mask);
- load_cr3(swapper_pg_dir);
-}
-
-/*
- *
- * The flush IPI assumes that a thread switch happens in this order:
- * [cpu0: the cpu that switches]
- * 1) switch_mm() either 1a) or 1b)
- * 1a) thread switch to a different mm
- * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask);
- * Stop ipi delivery for the old mm. This is not synchronized with
- * the other cpus, but smp_invalidate_interrupt ignore flush ipis
- * for the wrong mm, and in the worst case we perform a superflous
- * tlb flush.
- * 1a2) set cpu_tlbstate to TLBSTATE_OK
- * Now the smp_invalidate_interrupt won't call leave_mm if cpu0
- * was in lazy tlb mode.
- * 1a3) update cpu_tlbstate[].active_mm
- * Now cpu0 accepts tlb flushes for the new mm.
- * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask);
- * Now the other cpus will send tlb flush ipis.
- * 1a4) change cr3.
- * 1b) thread switch without mm change
- * cpu_tlbstate[].active_mm is correct, cpu0 already handles
- * flush ipis.
- * 1b1) set cpu_tlbstate to TLBSTATE_OK
- * 1b2) test_and_set the cpu bit in cpu_vm_mask.
- * Atomically set the bit [other cpus will start sending flush ipis],
- * and test the bit.
- * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
- * 2) switch %%esp, ie current
- *
- * The interrupt must handle 2 special cases:
- * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm.
- * - the cpu performs speculative tlb reads, i.e. even if the cpu only
- * runs in kernel space, the cpu could load tlb entries for user space
- * pages.
- *
- * The good news is that cpu_tlbstate is local to each cpu, no
- * write/read ordering problems.
- */
-
-/*
- * TLB flush IPI:
- *
- * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
- * 2) Leave the mm if we are in the lazy tlb mode.
- */
-
-irqreturn_t smp_invalidate_interrupt(int irq, void *dev_id,
- struct pt_regs *regs)
-{
- unsigned long cpu;
-
- cpu = get_cpu();
-
- if (!cpu_isset(cpu, flush_cpumask))
- goto out;
- /*
- * This was a BUG() but until someone can quote me the
- * line from the intel manual that guarantees an IPI to
- * multiple CPUs is retried _only_ on the erroring CPUs
- * its staying as a return
- *
- * BUG();
- */
-
- if (flush_mm == per_cpu(cpu_tlbstate, cpu).active_mm) {
- if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) {
- if (flush_va == FLUSH_ALL)
- local_flush_tlb();
- else
- __flush_tlb_one(flush_va);
- } else
- leave_mm(cpu);
- }
- smp_mb__before_clear_bit();
- cpu_clear(cpu, flush_cpumask);
- smp_mb__after_clear_bit();
-out:
- put_cpu_no_resched();
-
- return IRQ_HANDLED;
-}
-
-static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
- unsigned long va)
-{
- cpumask_t tmp;
- /*
- * A couple of (to be removed) sanity checks:
- *
- * - we do not send IPIs to not-yet booted CPUs.
- * - current CPU must not be in mask
- * - mask must exist :)
- */
- BUG_ON(cpus_empty(cpumask));
-
- cpus_and(tmp, cpumask, cpu_online_map);
- BUG_ON(!cpus_equal(cpumask, tmp));
- BUG_ON(cpu_isset(smp_processor_id(), cpumask));
- BUG_ON(!mm);
-
- /*
- * i'm not happy about this global shared spinlock in the
- * MM hot path, but we'll see how contended it is.
- * Temporarily this turns IRQs off, so that lockups are
- * detected by the NMI watchdog.
- */
- spin_lock(&tlbstate_lock);
-
- flush_mm = mm;
- flush_va = va;
-#if NR_CPUS <= BITS_PER_LONG
- atomic_set_mask(cpumask, &flush_cpumask);
-#else
- {
- int k;
- unsigned long *flush_mask = (unsigned long *)&flush_cpumask;
- unsigned long *cpu_mask = (unsigned long *)&cpumask;
- for (k = 0; k < BITS_TO_LONGS(NR_CPUS); ++k)
- atomic_set_mask(cpu_mask[k], &flush_mask[k]);
- }
-#endif
- /*
- * We have to send the IPI only to
- * CPUs affected.
- */
- send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR);
-
- while (!cpus_empty(flush_cpumask))
- /* nothing. lockup detection does not belong here */
- mb();
-
- flush_mm = NULL;
- flush_va = 0;
- spin_unlock(&tlbstate_lock);
-}
-
-void flush_tlb_current_task(void)
-{
- struct mm_struct *mm = current->mm;
- cpumask_t cpu_mask;
-
- preempt_disable();
- cpu_mask = mm->cpu_vm_mask;
- cpu_clear(smp_processor_id(), cpu_mask);
-
- local_flush_tlb();
- if (!cpus_empty(cpu_mask))
- flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
- preempt_enable();
-}
-
-void flush_tlb_mm (struct mm_struct * mm)
-{
- cpumask_t cpu_mask;
-
- preempt_disable();
- cpu_mask = mm->cpu_vm_mask;
- cpu_clear(smp_processor_id(), cpu_mask);
-
- if (current->active_mm == mm) {
- if (current->mm)
- local_flush_tlb();
- else
- leave_mm(smp_processor_id());
- }
- if (!cpus_empty(cpu_mask))
- flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
-
- preempt_enable();
-}
-
-void flush_tlb_page(struct vm_area_struct * vma, unsigned long va)
-{
- struct mm_struct *mm = vma->vm_mm;
- cpumask_t cpu_mask;
-
- preempt_disable();
- cpu_mask = mm->cpu_vm_mask;
- cpu_clear(smp_processor_id(), cpu_mask);
-
- if (current->active_mm == mm) {
- if(current->mm)
- __flush_tlb_one(va);
- else
- leave_mm(smp_processor_id());
- }
-
- if (!cpus_empty(cpu_mask))
- flush_tlb_others(cpu_mask, mm, va);
-
- preempt_enable();
-}
-
-static void do_flush_tlb_all(void* info)
-{
- unsigned long cpu = smp_processor_id();
-
- __flush_tlb_all();
- if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_LAZY)
- leave_mm(cpu);
-}
-
-void flush_tlb_all(void)
-{
- on_each_cpu(do_flush_tlb_all, NULL, 1, 1);
-}
-
-/*
- * this function sends a 'reschedule' IPI to another CPU.
- * it goes straight through and wastes no time serializing
- * anything. Worst case is that we lose a reschedule ...
- */
-void smp_send_reschedule(int cpu)
-{
- send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
-}
-
-/*
- * Structure and data for smp_call_function(). This is designed to minimise
- * static memory requirements. It also looks cleaner.
- */
-static spinlock_t call_lock = SPIN_LOCK_UNLOCKED;
-
-struct call_data_struct {
- void (*func) (void *info);
- void *info;
- atomic_t started;
- atomic_t finished;
- int wait;
-};
-
-static struct call_data_struct * call_data;
-
-/*
- * this function sends a 'generic call function' IPI to all other CPUs
- * in the system.
- */
-
-int smp_call_function (void (*func) (void *info), void *info, int nonatomic,
- int wait)
-/*
- * [SUMMARY] Run a function on all other CPUs.
- * <func> The function to run. This must be fast and non-blocking.
- * <info> An arbitrary pointer to pass to the function.
- * <nonatomic> currently unused.
- * <wait> If true, wait (atomically) until function has completed on other CPUs.
- * [RETURNS] 0 on success, else a negative status code. Does not return until
- * remote CPUs are nearly ready to execute <<func>> or are or have executed.
- *
- * You must not call this function with disabled interrupts or from a
- * hardware interrupt handler or from a bottom half handler.
- */
-{
- struct call_data_struct data;
- int cpus = num_online_cpus()-1;
-
- if (!cpus)
- return 0;
-
- /* Can deadlock when called with interrupts disabled */
- WARN_ON(irqs_disabled());
-
- data.func = func;
- data.info = info;
- atomic_set(&data.started, 0);
- data.wait = wait;
- if (wait)
- atomic_set(&data.finished, 0);
-
- spin_lock(&call_lock);
- call_data = &data;
- mb();
-
- /* Send a message to all other CPUs and wait for them to respond */
- send_IPI_allbutself(CALL_FUNCTION_VECTOR);
-
- /* Wait for response */
- while (atomic_read(&data.started) != cpus)
- barrier();
-
- if (wait)
- while (atomic_read(&data.finished) != cpus)
- barrier();
- spin_unlock(&call_lock);
-
- return 0;
-}
-
-static void stop_this_cpu (void * dummy)
-{
- /*
- * Remove this CPU:
- */
- cpu_clear(smp_processor_id(), cpu_online_map);
- local_irq_disable();
-#if 1
- xxprint("stop_this_cpu disable_local_APIC\n");
-#else
- disable_local_APIC();
-#endif
- if (cpu_data[smp_processor_id()].hlt_works_ok)
- for(;;) __asm__("hlt");
- for (;;);
-}
-
-/*
- * this function calls the 'stop' function on all other CPUs in the system.
- */
-
-void smp_send_stop(void)
-{
- smp_call_function(stop_this_cpu, NULL, 1, 0);
-
- local_irq_disable();
-#if 1
- xxprint("smp_send_stop disable_local_APIC\n");
-#else
- disable_local_APIC();
-#endif
- local_irq_enable();
-}
-
-/*
- * Reschedule call back. Nothing to do,
- * all the work is done automatically when
- * we return from the interrupt.
- */
-irqreturn_t smp_reschedule_interrupt(int irq, void *dev_id,
- struct pt_regs *regs)
-{
-
- return IRQ_HANDLED;
-}
-
-#include <linux/kallsyms.h>
-irqreturn_t smp_call_function_interrupt(int irq, void *dev_id,
- struct pt_regs *regs)
-{
- void (*func) (void *info) = call_data->func;
- void *info = call_data->info;
- int wait = call_data->wait;
-
- /*
- * Notify initiating CPU that I've grabbed the data and am
- * about to execute the function
- */
- mb();
- atomic_inc(&call_data->started);
- /*
- * At this point the info structure may be out of scope unless wait==1
- */
- irq_enter();
- (*func)(info);
- irq_exit();
-
- if (wait) {
- mb();
- atomic_inc(&call_data->finished);
- }
-
- return IRQ_HANDLED;
-}
-
+++ /dev/null
-/*
- * x86 SMP booting functions
- *
- * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
- * (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com>
- *
- * Much of the core SMP work is based on previous work by Thomas Radke, to
- * whom a great many thanks are extended.
- *
- * Thanks to Intel for making available several different Pentium,
- * Pentium Pro and Pentium-II/Xeon MP machines.
- * Original development of Linux SMP code supported by Caldera.
- *
- * This code is released under the GNU General Public License version 2 or
- * later.
- *
- * Fixes
- * Felix Koop : NR_CPUS used properly
- * Jose Renau : Handle single CPU case.
- * Alan Cox : By repeated request 8) - Total BogoMIPS report.
- * Greg Wright : Fix for kernel stacks panic.
- * Erich Boleyn : MP v1.4 and additional changes.
- * Matthias Sattler : Changes for 2.1 kernel map.
- * Michel Lespinasse : Changes for 2.1 kernel map.
- * Michael Chastain : Change trampoline.S to gnu as.
- * Alan Cox : Dumb bug: 'B' step PPro's are fine
- * Ingo Molnar : Added APIC timers, based on code
- * from Jose Renau
- * Ingo Molnar : various cleanups and rewrites
- * Tigran Aivazian : fixed "0.00 in /proc/uptime on SMP" bug.
- * Maciej W. Rozycki : Bits for genuine 82489DX APICs
- * Martin J. Bligh : Added support for multi-quad systems
- * Dave Jones : Report invalid combinations of Athlon CPUs.
-* Rusty Russell : Hacked into shape for new "hotplug" boot process. */
-
-#include <linux/module.h>
-#include <linux/config.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-
-#include <linux/mm.h>
-#include <linux/sched.h>
-#include <linux/kernel_stat.h>
-#include <linux/smp_lock.h>
-#include <linux/irq.h>
-#include <linux/bootmem.h>
-
-#include <linux/delay.h>
-#include <linux/mc146818rtc.h>
-#include <asm/tlbflush.h>
-#include <asm/desc.h>
-#include <asm/arch_hooks.h>
-
-#if 1
-#define Dprintk(args...)
-#else
-#include <mach_apic.h>
-#endif
-#include <mach_wakecpu.h>
-#include <smpboot_hooks.h>
-
-/* Set if we find a B stepping CPU */
-static int __initdata smp_b_stepping;
-
-/* Number of siblings per CPU package */
-int smp_num_siblings = 1;
-int phys_proc_id[NR_CPUS]; /* Package ID of each logical CPU */
-
-/* bitmap of online cpus */
-cpumask_t cpu_online_map;
-
-static cpumask_t cpu_callin_map;
-cpumask_t cpu_callout_map;
-static cpumask_t smp_commenced_mask;
-
-/* Per CPU bogomips and other parameters */
-struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned;
-
-u8 x86_cpu_to_apicid[NR_CPUS] =
- { [0 ... NR_CPUS-1] = 0xff };
-EXPORT_SYMBOL(x86_cpu_to_apicid);
-
-/* Set when the idlers are all forked */
-int smp_threads_ready;
-
-#if 0
-/*
- * Trampoline 80x86 program as an array.
- */
-
-extern unsigned char trampoline_data [];
-extern unsigned char trampoline_end [];
-static unsigned char *trampoline_base;
-static int trampoline_exec;
-
-/*
- * Currently trivial. Write the real->protected mode
- * bootstrap into the page concerned. The caller
- * has made sure it's suitably aligned.
- */
-
-static unsigned long __init setup_trampoline(void)
-{
- memcpy(trampoline_base, trampoline_data, trampoline_end - trampoline_data);
- return virt_to_phys(trampoline_base);
-}
-#endif
-
-/*
- * We are called very early to get the low memory for the
- * SMP bootup trampoline page.
- */
-void __init smp_alloc_memory(void)
-{
-#if 1
- int cpu;
-
- for (cpu = 1; cpu < NR_CPUS; cpu++) {
- cpu_gdt_descr[cpu].address = (unsigned long)
- alloc_bootmem_low_pages(PAGE_SIZE);
- /* XXX free unused pages later */
- }
-#else
- trampoline_base = (void *) alloc_bootmem_low_pages(PAGE_SIZE);
- /*
- * Has to be in very low memory so we can execute
- * real-mode AP code.
- */
- if (__pa(trampoline_base) >= 0x9F000)
- BUG();
- /*
- * Make the SMP trampoline executable:
- */
- trampoline_exec = set_kernel_exec((unsigned long)trampoline_base, 1);
-#endif
-}
-
-/*
- * The bootstrap kernel entry code has set these up. Save them for
- * a given CPU
- */
-
-static void __init smp_store_cpu_info(int id)
-{
- struct cpuinfo_x86 *c = cpu_data + id;
-
- *c = boot_cpu_data;
- if (id!=0)
- identify_cpu(c);
- /*
- * Mask B, Pentium, but not Pentium MMX
- */
- if (c->x86_vendor == X86_VENDOR_INTEL &&
- c->x86 == 5 &&
- c->x86_mask >= 1 && c->x86_mask <= 4 &&
- c->x86_model <= 3)
- /*
- * Remember we have B step Pentia with bugs
- */
- smp_b_stepping = 1;
-
- /*
- * Certain Athlons might work (for various values of 'work') in SMP
- * but they are not certified as MP capable.
- */
- if ((c->x86_vendor == X86_VENDOR_AMD) && (c->x86 == 6)) {
-
- /* Athlon 660/661 is valid. */
- if ((c->x86_model==6) && ((c->x86_mask==0) || (c->x86_mask==1)))
- goto valid_k7;
-
- /* Duron 670 is valid */
- if ((c->x86_model==7) && (c->x86_mask==0))
- goto valid_k7;
-
- /*
- * Athlon 662, Duron 671, and Athlon >model 7 have capability bit.
- * It's worth noting that the A5 stepping (662) of some Athlon XP's
- * have the MP bit set.
- * See http://www.heise.de/newsticker/data/jow-18.10.01-000 for more.
- */
- if (((c->x86_model==6) && (c->x86_mask>=2)) ||
- ((c->x86_model==7) && (c->x86_mask>=1)) ||
- (c->x86_model> 7))
- if (cpu_has_mp)
- goto valid_k7;
-
- /* If we get here, it's not a certified SMP capable AMD system. */
- tainted |= TAINT_UNSAFE_SMP;
- }
-
-valid_k7:
- ;
-}
-
-#if 0
-/*
- * TSC synchronization.
- *
- * We first check whether all CPUs have their TSC's synchronized,
- * then we print a warning if not, and always resync.
- */
-
-static atomic_t tsc_start_flag = ATOMIC_INIT(0);
-static atomic_t tsc_count_start = ATOMIC_INIT(0);
-static atomic_t tsc_count_stop = ATOMIC_INIT(0);
-static unsigned long long tsc_values[NR_CPUS];
-
-#define NR_LOOPS 5
-
-static void __init synchronize_tsc_bp (void)
-{
- int i;
- unsigned long long t0;
- unsigned long long sum, avg;
- long long delta;
- unsigned long one_usec;
- int buggy = 0;
-
- printk(KERN_INFO "checking TSC synchronization across %u CPUs: ", num_booting_cpus());
-
- /* convert from kcyc/sec to cyc/usec */
- one_usec = cpu_khz / 1000;
-
- atomic_set(&tsc_start_flag, 1);
- wmb();
-
- /*
- * We loop a few times to get a primed instruction cache,
- * then the last pass is more or less synchronized and
- * the BP and APs set their cycle counters to zero all at
- * once. This reduces the chance of having random offsets
- * between the processors, and guarantees that the maximum
- * delay between the cycle counters is never bigger than
- * the latency of information-passing (cachelines) between
- * two CPUs.
- */
- for (i = 0; i < NR_LOOPS; i++) {
- /*
- * all APs synchronize but they loop on '== num_cpus'
- */
- while (atomic_read(&tsc_count_start) != num_booting_cpus()-1)
- mb();
- atomic_set(&tsc_count_stop, 0);
- wmb();
- /*
- * this lets the APs save their current TSC:
- */
- atomic_inc(&tsc_count_start);
-
- rdtscll(tsc_values[smp_processor_id()]);
- /*
- * We clear the TSC in the last loop:
- */
- if (i == NR_LOOPS-1)
- write_tsc(0, 0);
-
- /*
- * Wait for all APs to leave the synchronization point:
- */
- while (atomic_read(&tsc_count_stop) != num_booting_cpus()-1)
- mb();
- atomic_set(&tsc_count_start, 0);
- wmb();
- atomic_inc(&tsc_count_stop);
- }
-
- sum = 0;
- for (i = 0; i < NR_CPUS; i++) {
- if (cpu_isset(i, cpu_callout_map)) {
- t0 = tsc_values[i];
- sum += t0;
- }
- }
- avg = sum;
- do_div(avg, num_booting_cpus());
-
- sum = 0;
- for (i = 0; i < NR_CPUS; i++) {
- if (!cpu_isset(i, cpu_callout_map))
- continue;
- delta = tsc_values[i] - avg;
- if (delta < 0)
- delta = -delta;
- /*
- * We report bigger than 2 microseconds clock differences.
- */
- if (delta > 2*one_usec) {
- long realdelta;
- if (!buggy) {
- buggy = 1;
- printk("\n");
- }
- realdelta = delta;
- do_div(realdelta, one_usec);
- if (tsc_values[i] < avg)
- realdelta = -realdelta;
-
- printk(KERN_INFO "CPU#%d had %ld usecs TSC skew, fixed it up.\n", i, realdelta);
- }
-
- sum += delta;
- }
- if (!buggy)
- printk("passed.\n");
-}
-
-static void __init synchronize_tsc_ap (void)
-{
- int i;
-
- /*
- * Not every cpu is online at the time
- * this gets called, so we first wait for the BP to
- * finish SMP initialization:
- */
- while (!atomic_read(&tsc_start_flag)) mb();
-
- for (i = 0; i < NR_LOOPS; i++) {
- atomic_inc(&tsc_count_start);
- while (atomic_read(&tsc_count_start) != num_booting_cpus())
- mb();
-
- rdtscll(tsc_values[smp_processor_id()]);
- if (i == NR_LOOPS-1)
- write_tsc(0, 0);
-
- atomic_inc(&tsc_count_stop);
- while (atomic_read(&tsc_count_stop) != num_booting_cpus()) mb();
- }
-}
-#undef NR_LOOPS
-#endif
-
-extern void calibrate_delay(void);
-
-static atomic_t init_deasserted;
-
-void __init smp_callin(void)
-{
- int cpuid, phys_id;
- unsigned long timeout;
-
-#if 0
- /*
- * If waken up by an INIT in an 82489DX configuration
- * we may get here before an INIT-deassert IPI reaches
- * our local APIC. We have to wait for the IPI or we'll
- * lock up on an APIC access.
- */
- wait_for_init_deassert(&init_deasserted);
-#endif
-
- /*
- * (This works even if the APIC is not enabled.)
- */
- phys_id = smp_processor_id();
- cpuid = smp_processor_id();
- if (cpu_isset(cpuid, cpu_callin_map)) {
- printk("huh, phys CPU#%d, CPU#%d already present??\n",
- phys_id, cpuid);
- BUG();
- }
- Dprintk("CPU#%d (phys ID: %d) waiting for CALLOUT\n", cpuid, phys_id);
-
- /*
- * STARTUP IPIs are fragile beasts as they might sometimes
- * trigger some glue motherboard logic. Complete APIC bus
- * silence for 1 second, this overestimates the time the
- * boot CPU is spending to send the up to 2 STARTUP IPIs
- * by a factor of two. This should be enough.
- */
-
- /*
- * Waiting 2s total for startup (udelay is not yet working)
- */
- timeout = jiffies + 2*HZ;
- while (time_before(jiffies, timeout)) {
- /*
- * Has the boot CPU finished it's STARTUP sequence?
- */
- if (cpu_isset(cpuid, cpu_callout_map))
- break;
- rep_nop();
- }
-
- if (!time_before(jiffies, timeout)) {
- printk("BUG: CPU%d started up but did not get a callout!\n",
- cpuid);
- BUG();
- }
-
-#if 0
- /*
- * the boot CPU has finished the init stage and is spinning
- * on callin_map until we finish. We are free to set up this
- * CPU, first the APIC. (this is probably redundant on most
- * boards)
- */
-
- Dprintk("CALLIN, before setup_local_APIC().\n");
- smp_callin_clear_local_apic();
- setup_local_APIC();
-#endif
- map_cpu_to_logical_apicid();
-
- local_irq_enable();
-
- /*
- * Get our bogomips.
- */
- calibrate_delay();
- Dprintk("Stack at about %p\n",&cpuid);
-
- /*
- * Save our processor parameters
- */
- smp_store_cpu_info(cpuid);
-
-#if 0
- disable_APIC_timer();
-#endif
- local_irq_disable();
- /*
- * Allow the master to continue.
- */
- cpu_set(cpuid, cpu_callin_map);
-
-#if 0
- /*
- * Synchronize the TSC with the BP
- */
- if (cpu_has_tsc && cpu_khz)
- synchronize_tsc_ap();
-#endif
-}
-
-int cpucount;
-
-extern int cpu_idle(void);
-
-
-static irqreturn_t local_debug_interrupt(int irq, void *dev_id,
- struct pt_regs *regs)
-{
-
- return IRQ_HANDLED;
-}
-
-static struct irqaction local_irq_debug = {
- local_debug_interrupt, SA_INTERRUPT, CPU_MASK_NONE, "ldebug",
- NULL, NULL
-};
-
-void local_setup_debug(void)
-{
- (void)setup_irq(bind_virq_to_irq(VIRQ_DEBUG), &local_irq_debug);
-}
-
-
-extern void local_setup_timer(void);
-
-/*
- * Activate a secondary processor.
- */
-int __init start_secondary(void *unused)
-{
- /*
- * Dont put anything before smp_callin(), SMP
- * booting is too fragile that we want to limit the
- * things done here to the most necessary things.
- */
- cpu_init();
- smp_callin();
- while (!cpu_isset(smp_processor_id(), smp_commenced_mask))
- rep_nop();
- local_setup_timer();
- local_setup_debug(); /* XXX */
- smp_intr_init();
- local_irq_enable();
- /*
- * low-memory mappings have been cleared, flush them from
- * the local TLBs too.
- */
- local_flush_tlb();
- cpu_set(smp_processor_id(), cpu_online_map);
- wmb();
- if (0) {
- char *msg2 = "delay2\n";
- int timeout;
- for (timeout = 0; timeout < 50000; timeout++) {
- udelay(1000);
- if (timeout == 2000) {
- (void)HYPERVISOR_console_io(CONSOLEIO_write, strlen(msg2), msg2);
- timeout = 0;
- }
- }
- }
- return cpu_idle();
-}
-
-/*
- * Everything has been set up for the secondary
- * CPUs - they just need to reload everything
- * from the task structure
- * This function must not return.
- */
-void __init initialize_secondary(void)
-{
- /*
- * We don't actually need to load the full TSS,
- * basically just the stack pointer and the eip.
- */
-
- asm volatile(
- "movl %0,%%esp\n\t"
- "jmp *%1"
- :
- :"r" (current->thread.esp),"r" (current->thread.eip));
-}
-
-extern struct {
- void * esp;
- unsigned short ss;
-} stack_start;
-
-#ifdef CONFIG_NUMA
-
-/* which logical CPUs are on which nodes */
-cpumask_t node_2_cpu_mask[MAX_NUMNODES] =
- { [0 ... MAX_NUMNODES-1] = CPU_MASK_NONE };
-/* which node each logical CPU is on */
-int cpu_2_node[NR_CPUS] = { [0 ... NR_CPUS-1] = 0 };
-EXPORT_SYMBOL(cpu_2_node);
-
-/* set up a mapping between cpu and node. */
-static inline void map_cpu_to_node(int cpu, int node)
-{
- printk("Mapping cpu %d to node %d\n", cpu, node);
- cpu_set(cpu, node_2_cpu_mask[node]);
- cpu_2_node[cpu] = node;
-}
-
-/* undo a mapping between cpu and node. */
-static inline void unmap_cpu_to_node(int cpu)
-{
- int node;
-
- printk("Unmapping cpu %d from all nodes\n", cpu);
- for (node = 0; node < MAX_NUMNODES; node ++)
- cpu_clear(cpu, node_2_cpu_mask[node]);
- cpu_2_node[cpu] = 0;
-}
-#else /* !CONFIG_NUMA */
-
-#define map_cpu_to_node(cpu, node) ({})
-#define unmap_cpu_to_node(cpu) ({})
-
-#endif /* CONFIG_NUMA */
-
-u8 cpu_2_logical_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
-
-void map_cpu_to_logical_apicid(void)
-{
- int cpu = smp_processor_id();
- int apicid = smp_processor_id();
-
- cpu_2_logical_apicid[cpu] = apicid;
- map_cpu_to_node(cpu, apicid_to_node(apicid));
-}
-
-void unmap_cpu_to_logical_apicid(int cpu)
-{
- cpu_2_logical_apicid[cpu] = BAD_APICID;
- unmap_cpu_to_node(cpu);
-}
-
-#if APIC_DEBUG
-static inline void __inquire_remote_apic(int apicid)
-{
- int i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 };
- char *names[] = { "ID", "VERSION", "SPIV" };
- int timeout, status;
-
- printk("Inquiring remote APIC #%d...\n", apicid);
-
- for (i = 0; i < sizeof(regs) / sizeof(*regs); i++) {
- printk("... APIC #%d %s: ", apicid, names[i]);
-
- /*
- * Wait for idle.
- */
- apic_wait_icr_idle();
-
- apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(apicid));
- apic_write_around(APIC_ICR, APIC_DM_REMRD | regs[i]);
-
- timeout = 0;
- do {
- udelay(100);
- status = apic_read(APIC_ICR) & APIC_ICR_RR_MASK;
- } while (status == APIC_ICR_RR_INPROG && timeout++ < 1000);
-
- switch (status) {
- case APIC_ICR_RR_VALID:
- status = apic_read(APIC_RRR);
- printk("%08x\n", status);
- break;
- default:
- printk("failed\n");
- }
- }
-}
-#endif
-
-#if 0
-#ifdef WAKE_SECONDARY_VIA_NMI
-/*
- * Poke the other CPU in the eye via NMI to wake it up. Remember that the normal
- * INIT, INIT, STARTUP sequence will reset the chip hard for us, and this
- * won't ... remember to clear down the APIC, etc later.
- */
-static int __init
-wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip)
-{
- unsigned long send_status = 0, accept_status = 0;
- int timeout, maxlvt;
-
- /* Target chip */
- apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(logical_apicid));
-
- /* Boot on the stack */
- /* Kick the second */
- apic_write_around(APIC_ICR, APIC_DM_NMI | APIC_DEST_LOGICAL);
-
- Dprintk("Waiting for send to finish...\n");
- timeout = 0;
- do {
- Dprintk("+");
- udelay(100);
- send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
- } while (send_status && (timeout++ < 1000));
-
- /*
- * Give the other CPU some time to accept the IPI.
- */
- udelay(200);
- /*
- * Due to the Pentium erratum 3AP.
- */
- maxlvt = get_maxlvt();
- if (maxlvt > 3) {
- apic_read_around(APIC_SPIV);
- apic_write(APIC_ESR, 0);
- }
- accept_status = (apic_read(APIC_ESR) & 0xEF);
- Dprintk("NMI sent.\n");
-
- if (send_status)
- printk("APIC never delivered???\n");
- if (accept_status)
- printk("APIC delivery error (%lx).\n", accept_status);
-
- return (send_status | accept_status);
-}
-#endif /* WAKE_SECONDARY_VIA_NMI */
-
-#ifdef WAKE_SECONDARY_VIA_INIT
-static int __init
-wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
-{
- unsigned long send_status = 0, accept_status = 0;
- int maxlvt, timeout, num_starts, j;
-
- /*
- * Be paranoid about clearing APIC errors.
- */
- if (APIC_INTEGRATED(apic_version[phys_apicid])) {
- apic_read_around(APIC_SPIV);
- apic_write(APIC_ESR, 0);
- apic_read(APIC_ESR);
- }
-
- Dprintk("Asserting INIT.\n");
-
- /*
- * Turn INIT on target chip
- */
- apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
-
- /*
- * Send IPI
- */
- apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_INT_ASSERT
- | APIC_DM_INIT);
-
- Dprintk("Waiting for send to finish...\n");
- timeout = 0;
- do {
- Dprintk("+");
- udelay(100);
- send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
- } while (send_status && (timeout++ < 1000));
-
- mdelay(10);
-
- Dprintk("Deasserting INIT.\n");
-
- /* Target chip */
- apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
-
- /* Send IPI */
- apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT);
-
- Dprintk("Waiting for send to finish...\n");
- timeout = 0;
- do {
- Dprintk("+");
- udelay(100);
- send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
- } while (send_status && (timeout++ < 1000));
-
- atomic_set(&init_deasserted, 1);
-
- /*
- * Should we send STARTUP IPIs ?
- *
- * Determine this based on the APIC version.
- * If we don't have an integrated APIC, don't send the STARTUP IPIs.
- */
- if (APIC_INTEGRATED(apic_version[phys_apicid]))
- num_starts = 2;
- else
- num_starts = 0;
-
- /*
- * Run STARTUP IPI loop.
- */
- Dprintk("#startup loops: %d.\n", num_starts);
-
- maxlvt = get_maxlvt();
-
- for (j = 1; j <= num_starts; j++) {
- Dprintk("Sending STARTUP #%d.\n",j);
- apic_read_around(APIC_SPIV);
- apic_write(APIC_ESR, 0);
- apic_read(APIC_ESR);
- Dprintk("After apic_write.\n");
-
- /*
- * STARTUP IPI
- */
-
- /* Target chip */
- apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
-
- /* Boot on the stack */
- /* Kick the second */
- apic_write_around(APIC_ICR, APIC_DM_STARTUP
- | (start_eip >> 12));
-
- /*
- * Give the other CPU some time to accept the IPI.
- */
- udelay(300);
-
- Dprintk("Startup point 1.\n");
-
- Dprintk("Waiting for send to finish...\n");
- timeout = 0;
- do {
- Dprintk("+");
- udelay(100);
- send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
- } while (send_status && (timeout++ < 1000));
-
- /*
- * Give the other CPU some time to accept the IPI.
- */
- udelay(200);
- /*
- * Due to the Pentium erratum 3AP.
- */
- if (maxlvt > 3) {
- apic_read_around(APIC_SPIV);
- apic_write(APIC_ESR, 0);
- }
- accept_status = (apic_read(APIC_ESR) & 0xEF);
- if (send_status || accept_status)
- break;
- }
- Dprintk("After Startup.\n");
-
- if (send_status)
- printk("APIC never delivered???\n");
- if (accept_status)
- printk("APIC delivery error (%lx).\n", accept_status);
-
- return (send_status | accept_status);
-}
-#endif /* WAKE_SECONDARY_VIA_INIT */
-#endif
-
-extern cpumask_t cpu_initialized;
-
-static int __init do_boot_cpu(int apicid)
-/*
- * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
- * (ie clustered apic addressing mode), this is a LOGICAL apic ID.
- * Returns zero if CPU booted OK, else error code from wakeup_secondary_cpu.
- */
-{
- struct task_struct *idle;
- unsigned long boot_error;
- int timeout, cpu;
- unsigned long start_eip;
-#if 0
- unsigned short nmi_high = 0, nmi_low = 0;
-#endif
- full_execution_context_t ctxt;
- extern void startup_32_smp(void);
- extern void hypervisor_callback(void);
- extern void failsafe_callback(void);
- extern int smp_trap_init(trap_info_t *);
- int i;
-
- cpu = ++cpucount;
- /*
- * We can't use kernel_thread since we must avoid to
- * reschedule the child.
- */
- idle = fork_idle(cpu);
- if (IS_ERR(idle))
- panic("failed fork for CPU %d", cpu);
- idle->thread.eip = (unsigned long) start_secondary;
- /* start_eip had better be page-aligned! */
- start_eip = (unsigned long)startup_32_smp;
-
- /* So we see what's up */
- printk("Booting processor %d/%d eip %lx\n", cpu, apicid, start_eip);
- /* Stack for startup_32 can be just as for start_secondary onwards */
- stack_start.esp = (void *) idle->thread.esp;
-
- irq_ctx_init(cpu);
-
- /*
- * This grunge runs the startup process for
- * the targeted processor.
- */
-
- atomic_set(&init_deasserted, 0);
-
-#if 1
- if (cpu_gdt_descr[0].size > PAGE_SIZE)
- BUG();
- cpu_gdt_descr[cpu].size = cpu_gdt_descr[0].size;
- memcpy((void *)cpu_gdt_descr[cpu].address,
- (void *)cpu_gdt_descr[0].address, cpu_gdt_descr[0].size);
- memset((char *)cpu_gdt_descr[cpu].address +
- FIRST_RESERVED_GDT_ENTRY * 8, 0,
- NR_RESERVED_GDT_ENTRIES * 8);
-
- memset(&ctxt, 0, sizeof(ctxt));
-
- ctxt.cpu_ctxt.ds = __USER_DS;
- ctxt.cpu_ctxt.es = __USER_DS;
- ctxt.cpu_ctxt.fs = 0;
- ctxt.cpu_ctxt.gs = 0;
- ctxt.cpu_ctxt.ss = __KERNEL_DS;
- ctxt.cpu_ctxt.cs = __KERNEL_CS;
- ctxt.cpu_ctxt.eip = start_eip;
- ctxt.cpu_ctxt.esp = idle->thread.esp;
- ctxt.cpu_ctxt.eflags = (1<<9) | (1<<2) | (idle->thread.io_pl<<12);
-
- /* FPU is set up to default initial state. */
- memset(ctxt.fpu_ctxt, 0, sizeof(ctxt.fpu_ctxt));
-
- /* Virtual IDT is empty at start-of-day. */
- for ( i = 0; i < 256; i++ )
- {
- ctxt.trap_ctxt[i].vector = i;
- ctxt.trap_ctxt[i].cs = FLAT_GUESTOS_CS;
- }
- ctxt.fast_trap_idx = smp_trap_init(ctxt.trap_ctxt);
-
- /* No LDT. */
- ctxt.ldt_ents = 0;
-
- {
- unsigned long va;
- int f;
-
- for (va = cpu_gdt_descr[cpu].address, f = 0;
- va < cpu_gdt_descr[cpu].address + cpu_gdt_descr[cpu].size;
- va += PAGE_SIZE, f++) {
- ctxt.gdt_frames[f] = virt_to_machine(va) >> PAGE_SHIFT;
- make_page_readonly((void *)va);
- }
- ctxt.gdt_ents = cpu_gdt_descr[cpu].size / 8;
- flush_page_update_queue();
- }
-
- /* Ring 1 stack is the initial stack. */
- ctxt.guestos_ss = __KERNEL_DS;
- ctxt.guestos_esp = idle->thread.esp;
-
- /* Callback handlers. */
- ctxt.event_callback_cs = __KERNEL_CS;
- ctxt.event_callback_eip = (unsigned long)hypervisor_callback;
- ctxt.failsafe_callback_cs = __KERNEL_CS;
- ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback;
-
- ctxt.pt_base = (unsigned long)virt_to_machine(swapper_pg_dir);
-
- boot_error = HYPERVISOR_boot_vcpu(cpu, &ctxt);
-
- if (!boot_error) {
- /*
- * allow APs to start initializing.
- */
- Dprintk("Before Callout %d.\n", cpu);
- cpu_set(cpu, cpu_callout_map);
- Dprintk("After Callout %d.\n", cpu);
-
- /*
- * Wait 5s total for a response
- */
- for (timeout = 0; timeout < 50000; timeout++) {
- if (cpu_isset(cpu, cpu_callin_map))
- break; /* It has booted */
- udelay(100);
- }
-
- if (cpu_isset(cpu, cpu_callin_map)) {
- /* number CPUs logically, starting from 1 (BSP is 0) */
- Dprintk("OK.\n");
- printk("CPU%d: ", cpu);
- print_cpu_info(&cpu_data[cpu]);
- Dprintk("CPU has booted.\n");
- } else {
- boot_error= 1;
- }
- }
- x86_cpu_to_apicid[cpu] = apicid;
- if (boot_error) {
- /* Try to put things back the way they were before ... */
- unmap_cpu_to_logical_apicid(cpu);
- cpu_clear(cpu, cpu_callout_map); /* was set here (do_boot_cpu()) */
- cpu_clear(cpu, cpu_initialized); /* was set by cpu_init() */
- cpucount--;
- }
-
-#else
- Dprintk("Setting warm reset code and vector.\n");
-
- store_NMI_vector(&nmi_high, &nmi_low);
-
- smpboot_setup_warm_reset_vector(start_eip);
-
- /*
- * Starting actual IPI sequence...
- */
- boot_error = wakeup_secondary_cpu(apicid, start_eip);
-
- if (!boot_error) {
- /*
- * allow APs to start initializing.
- */
- Dprintk("Before Callout %d.\n", cpu);
- cpu_set(cpu, cpu_callout_map);
- Dprintk("After Callout %d.\n", cpu);
-
- /*
- * Wait 5s total for a response
- */
- for (timeout = 0; timeout < 50000; timeout++) {
- if (cpu_isset(cpu, cpu_callin_map))
- break; /* It has booted */
- udelay(100);
- }
-
- if (cpu_isset(cpu, cpu_callin_map)) {
- /* number CPUs logically, starting from 1 (BSP is 0) */
- Dprintk("OK.\n");
- printk("CPU%d: ", cpu);
- print_cpu_info(&cpu_data[cpu]);
- Dprintk("CPU has booted.\n");
- } else {
- boot_error= 1;
- if (*((volatile unsigned char *)trampoline_base)
- == 0xA5)
- /* trampoline started but...? */
- printk("Stuck ??\n");
- else
- /* trampoline code not run */
- printk("Not responding.\n");
- inquire_remote_apic(apicid);
- }
- }
- x86_cpu_to_apicid[cpu] = apicid;
- if (boot_error) {
- /* Try to put things back the way they were before ... */
- unmap_cpu_to_logical_apicid(cpu);
- cpu_clear(cpu, cpu_callout_map); /* was set here (do_boot_cpu()) */
- cpu_clear(cpu, cpu_initialized); /* was set by cpu_init() */
- cpucount--;
- }
-
- /* mark "stuck" area as not stuck */
- *((volatile unsigned long *)trampoline_base) = 0;
-#endif
-
- return boot_error;
-}
-
-cycles_t cacheflush_time;
-unsigned long cache_decay_ticks;
-
-static void smp_tune_scheduling (void)
-{
- unsigned long cachesize; /* kB */
- unsigned long bandwidth = 350; /* MB/s */
- /*
- * Rough estimation for SMP scheduling, this is the number of
- * cycles it takes for a fully memory-limited process to flush
- * the SMP-local cache.
- *
- * (For a P5 this pretty much means we will choose another idle
- * CPU almost always at wakeup time (this is due to the small
- * L1 cache), on PIIs it's around 50-100 usecs, depending on
- * the cache size)
- */
-
- if (!cpu_khz) {
- /*
- * this basically disables processor-affinity
- * scheduling on SMP without a TSC.
- */
- cacheflush_time = 0;
- return;
- } else {
- cachesize = boot_cpu_data.x86_cache_size;
- if (cachesize == -1) {
- cachesize = 16; /* Pentiums, 2x8kB cache */
- bandwidth = 100;
- }
-
- cacheflush_time = (cpu_khz>>10) * (cachesize<<10) / bandwidth;
- }
-
- cache_decay_ticks = (long)cacheflush_time/cpu_khz + 1;
-
- printk("per-CPU timeslice cutoff: %ld.%02ld usecs.\n",
- (long)cacheflush_time/(cpu_khz/1000),
- ((long)cacheflush_time*100/(cpu_khz/1000)) % 100);
- printk("task migration cache decay timeout: %ld msecs.\n",
- cache_decay_ticks);
-}
-
-/*
- * Cycle through the processors sending APIC IPIs to boot each.
- */
-
-#if 0
-static int boot_cpu_logical_apicid;
-#endif
-/* Where the IO area was mapped on multiquad, always 0 otherwise */
-void *xquad_portio;
-
-cpumask_t cpu_sibling_map[NR_CPUS] __cacheline_aligned;
-
-static void __init smp_boot_cpus(unsigned int max_cpus)
-{
- int cpu, kicked;
- unsigned long bogosum = 0;
-#if 0
- int apicid, bit;
-#endif
-
- /*
- * Setup boot CPU information
- */
- smp_store_cpu_info(0); /* Final full version of the data */
- printk("CPU%d: ", 0);
- print_cpu_info(&cpu_data[0]);
-
-#if 0
- boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID));
- boot_cpu_logical_apicid = logical_smp_processor_id();
- x86_cpu_to_apicid[0] = boot_cpu_physical_apicid;
-#else
- // boot_cpu_physical_apicid = 0;
- // boot_cpu_logical_apicid = 0;
- x86_cpu_to_apicid[0] = 0;
-#endif
-
- current_thread_info()->cpu = 0;
- smp_tune_scheduling();
- cpus_clear(cpu_sibling_map[0]);
- cpu_set(0, cpu_sibling_map[0]);
-
- /*
- * If we couldn't find an SMP configuration at boot time,
- * get out of here now!
- */
- if (!smp_found_config /* && !acpi_lapic) */) {
- printk(KERN_NOTICE "SMP motherboard not detected.\n");
- smpboot_clear_io_apic_irqs();
-#if 0
- phys_cpu_present_map = physid_mask_of_physid(0);
- if (APIC_init_uniprocessor())
- printk(KERN_NOTICE "Local APIC not detected."
- " Using dummy APIC emulation.\n");
-#endif
- map_cpu_to_logical_apicid();
- return;
- }
-
-#if 0
- /*
- * Should not be necessary because the MP table should list the boot
- * CPU too, but we do it for the sake of robustness anyway.
- * Makes no sense to do this check in clustered apic mode, so skip it
- */
- if (!check_phys_apicid_present(boot_cpu_physical_apicid)) {
- printk("weird, boot CPU (#%d) not listed by the BIOS.\n",
- boot_cpu_physical_apicid);
- physid_set(hard_smp_processor_id(), phys_cpu_present_map);
- }
-
- /*
- * If we couldn't find a local APIC, then get out of here now!
- */
- if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid]) && !cpu_has_apic) {
- printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n",
- boot_cpu_physical_apicid);
- printk(KERN_ERR "... forcing use of dummy APIC emulation. (tell your hw vendor)\n");
- smpboot_clear_io_apic_irqs();
- phys_cpu_present_map = physid_mask_of_physid(0);
- return;
- }
-
- verify_local_APIC();
-#endif
-
- /*
- * If SMP should be disabled, then really disable it!
- */
- if (!max_cpus) {
- HYPERVISOR_shared_info->n_vcpu = 1;
- printk(KERN_INFO "SMP mode deactivated, forcing use of dummy APIC emulation.\n");
- smpboot_clear_io_apic_irqs();
-#if 0
- phys_cpu_present_map = physid_mask_of_physid(0);
-#endif
- return;
- }
-
- smp_intr_init();
-
-#if 0
- connect_bsp_APIC();
- setup_local_APIC();
-#endif
- map_cpu_to_logical_apicid();
-#if 0
-
-
- setup_portio_remap();
-
- /*
- * Scan the CPU present map and fire up the other CPUs via do_boot_cpu
- *
- * In clustered apic mode, phys_cpu_present_map is a constructed thus:
- * bits 0-3 are quad0, 4-7 are quad1, etc. A perverse twist on the
- * clustered apic ID.
- */
- Dprintk("CPU present map: %lx\n", physids_coerce(phys_cpu_present_map));
-#endif
- Dprintk("CPU present map: %lx\n",
- (1UL << HYPERVISOR_shared_info->n_vcpu) - 1);
-
- kicked = 1;
- for (cpu = 1; kicked < NR_CPUS &&
- cpu < HYPERVISOR_shared_info->n_vcpu; cpu++) {
- if (max_cpus <= cpucount+1)
- continue;
-
- if (do_boot_cpu(cpu))
- printk("CPU #%d not responding - cannot use it.\n",
- cpu);
- else
- ++kicked;
- }
-
-#if 0
- /*
- * Cleanup possible dangling ends...
- */
- smpboot_restore_warm_reset_vector();
-#endif
-
- /*
- * Allow the user to impress friends.
- */
- Dprintk("Before bogomips.\n");
- for (cpu = 0; cpu < NR_CPUS; cpu++)
- if (cpu_isset(cpu, cpu_callout_map))
- bogosum += cpu_data[cpu].loops_per_jiffy;
- printk(KERN_INFO
- "Total of %d processors activated (%lu.%02lu BogoMIPS).\n",
- cpucount+1,
- bogosum/(500000/HZ),
- (bogosum/(5000/HZ))%100);
-
- Dprintk("Before bogocount - setting activated=1.\n");
-
- if (smp_b_stepping)
- printk(KERN_WARNING "WARNING: SMP operation may be unreliable with B stepping processors.\n");
-
- /*
- * Don't taint if we are running SMP kernel on a single non-MP
- * approved Athlon
- */
- if (tainted & TAINT_UNSAFE_SMP) {
- if (cpucount)
- printk (KERN_INFO "WARNING: This combination of AMD processors is not suitable for SMP.\n");
- else
- tainted &= ~TAINT_UNSAFE_SMP;
- }
-
- Dprintk("Boot done.\n");
-
- /*
- * construct cpu_sibling_map[], so that we can tell sibling CPUs
- * efficiently.
- */
- for (cpu = 0; cpu < NR_CPUS; cpu++)
- cpus_clear(cpu_sibling_map[cpu]);
-
- for (cpu = 0; cpu < NR_CPUS; cpu++) {
- int siblings = 0;
- int i;
- if (!cpu_isset(cpu, cpu_callout_map))
- continue;
-
- if (smp_num_siblings > 1) {
- for (i = 0; i < NR_CPUS; i++) {
- if (!cpu_isset(i, cpu_callout_map))
- continue;
- if (phys_proc_id[cpu] == phys_proc_id[i]) {
- siblings++;
- cpu_set(i, cpu_sibling_map[cpu]);
- }
- }
- } else {
- siblings++;
- cpu_set(cpu, cpu_sibling_map[cpu]);
- }
-
- if (siblings != smp_num_siblings)
- printk(KERN_WARNING "WARNING: %d siblings found for CPU%d, should be %d\n", siblings, cpu, smp_num_siblings);
- }
-
-#if 0
- if (nmi_watchdog == NMI_LOCAL_APIC)
- check_nmi_watchdog();
-
- smpboot_setup_io_apic();
-
- setup_boot_APIC_clock();
-
- /*
- * Synchronize the TSC with the AP
- */
- if (cpu_has_tsc && cpucount && cpu_khz)
- synchronize_tsc_bp();
-#endif
-}
-
-/* These are wrappers to interface to the new boot process. Someone
- who understands all this stuff should rewrite it properly. --RR 15/Jul/02 */
-void __init smp_prepare_cpus(unsigned int max_cpus)
-{
- smp_boot_cpus(max_cpus);
-}
-
-void __devinit smp_prepare_boot_cpu(void)
-{
- cpu_set(smp_processor_id(), cpu_online_map);
- cpu_set(smp_processor_id(), cpu_callout_map);
-}
-
-int __devinit __cpu_up(unsigned int cpu)
-{
- /* This only works at boot for x86. See "rewrite" above. */
- if (cpu_isset(cpu, smp_commenced_mask)) {
- local_irq_enable();
- return -ENOSYS;
- }
-
- /* In case one didn't come up */
- if (!cpu_isset(cpu, cpu_callin_map)) {
- local_irq_enable();
- return -EIO;
- }
-
- local_irq_enable();
- /* Unleash the CPU! */
- cpu_set(cpu, smp_commenced_mask);
- while (!cpu_isset(cpu, cpu_online_map))
- mb();
- return 0;
-}
-
-void __init smp_cpus_done(unsigned int max_cpus)
-{
-#if 1
-#else
-#ifdef CONFIG_X86_IO_APIC
- setup_ioapic_dest();
-#endif
- zap_low_mappings();
- /*
- * Disable executability of the SMP trampoline:
- */
- set_kernel_exec((unsigned long)trampoline_base, trampoline_exec);
-#endif
-}
-
-extern irqreturn_t smp_reschedule_interrupt(int, void *, struct pt_regs *);
-
-static struct irqaction reschedule_irq = {
- smp_reschedule_interrupt, SA_INTERRUPT, CPU_MASK_NONE, "reschedule",
- NULL, NULL
-};
-
-extern irqreturn_t smp_invalidate_interrupt(int, void *, struct pt_regs *);
-
-static struct irqaction invalidate_irq = {
- smp_invalidate_interrupt, SA_INTERRUPT, CPU_MASK_NONE, "invalidate",
- NULL, NULL
-};
-
-extern irqreturn_t smp_call_function_interrupt(int, void *, struct pt_regs *);
-
-static struct irqaction call_function_irq = {
- smp_call_function_interrupt, SA_INTERRUPT, CPU_MASK_NONE,
- "call_function", NULL, NULL
-};
-
-void __init smp_intr_init(void)
-{
-
- (void)setup_irq(
- bind_ipi_on_cpu_to_irq(smp_processor_id(), RESCHEDULE_VECTOR),
- &reschedule_irq);
- (void)setup_irq(
- bind_ipi_on_cpu_to_irq(smp_processor_id(), INVALIDATE_TLB_VECTOR),
- &invalidate_irq);
- (void)setup_irq(
- bind_ipi_on_cpu_to_irq(smp_processor_id(), CALL_FUNCTION_VECTOR),
- &call_function_irq);
-}
+++ /dev/null
-/* Copyright (C) 2004, Christian Limpach */
-
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/threads.h>
-
-unsigned int __initdata maxcpus = NR_CPUS;
-
-
-/*
- * the frequency of the profiling timer can be changed
- * by writing a multiplier value into /proc/profile.
- */
-int setup_profiling_timer(unsigned int multiplier)
-{
- printk("setup_profiling_timer\n");
-
- return 0;
-}
+++ /dev/null
-
-obj-y := blktap_userdev.o blktap_datapath.o blktap_controlmsg.o blktap.o
-
+++ /dev/null
-/******************************************************************************
- * blktap.c
- *
- * XenLinux virtual block-device tap.
- *
- * Copyright (c) 2004, Andrew Warfield
- *
- * Based on the original split block driver:
- * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
- * Modifications by Mark A. Williamson are (c) Intel Research Cambridge
- * Copyright (c) 2004, Christian Limpach
- *
- * Note that unlike the split block driver code, this driver has been developed
- * strictly for Linux 2.6
- */
-
-#include "blktap.h"
-
-int __init xlblk_init(void)
-{
- ctrl_msg_t cmsg;
- blkif_fe_driver_status_t fe_st;
- blkif_be_driver_status_t be_st;
-
- printk(KERN_INFO "Initialising Xen block tap device\n");
-
- DPRINTK(" tap - Backend connection init:\n");
-
-
- (void)ctrl_if_register_receiver(CMSG_BLKIF_FE, blkif_ctrlif_rx,
- CALLBACK_IN_BLOCKING_CONTEXT);
-
- /* Send a driver-UP notification to the domain controller. */
- cmsg.type = CMSG_BLKIF_FE;
- cmsg.subtype = CMSG_BLKIF_FE_DRIVER_STATUS;
- cmsg.length = sizeof(blkif_fe_driver_status_t);
- fe_st.status = BLKIF_DRIVER_STATUS_UP;
- memcpy(cmsg.msg, &fe_st, sizeof(fe_st));
- ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE);
-
- DPRINTK(" tap - Frontend connection init:\n");
-
- active_reqs_init();
-
- ptfe_blkif.status = DISCONNECTED;
-
- (void)ctrl_if_register_receiver(CMSG_BLKIF_BE, blkif_ctrlif_rx,
- CALLBACK_IN_BLOCKING_CONTEXT);
-
- /* Send a driver-UP notification to the domain controller. */
- cmsg.type = CMSG_BLKIF_BE;
- cmsg.subtype = CMSG_BLKIF_BE_DRIVER_STATUS;
- cmsg.length = sizeof(blkif_be_driver_status_t);
- be_st.status = BLKIF_DRIVER_STATUS_UP;
- memcpy(cmsg.msg, &be_st, sizeof(be_st));
- ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE);
-
- DPRINTK(" tap - Userland channel init:\n");
-
- blktap_init();
-
- DPRINTK("Blkif tap device initialized.\n");
-
- return 0;
-}
-
-void blkdev_suspend(void)
-{
-}
-
-void blkdev_resume(void)
-{
- ctrl_msg_t cmsg;
- blkif_fe_driver_status_t st;
-
- /* Send a driver-UP notification to the domain controller. */
- cmsg.type = CMSG_BLKIF_FE;
- cmsg.subtype = CMSG_BLKIF_FE_DRIVER_STATUS;
- cmsg.length = sizeof(blkif_fe_driver_status_t);
- st.status = BLKIF_DRIVER_STATUS_UP;
- memcpy(cmsg.msg, &st, sizeof(st));
- ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE);
-}
-
-
-__initcall(xlblk_init);
+++ /dev/null
-/*
- * blktap.h
- *
- * Interfaces for the Xen block tap driver.
- *
- * (c) 2004, Andrew Warfield, University of Cambridge
- *
- */
-
-#ifndef __BLKTAP_H__
-#define __BLKTAP_H__
-
-#include <linux/version.h>
-#include <linux/blkdev.h>
-#include <linux/config.h>
-#include <linux/sched.h>
-#include <linux/interrupt.h>
-#include <asm-xen/ctrl_if.h>
-#include <linux/slab.h>
-#include <linux/blkdev.h>
-#include <asm/io.h>
-#include <asm/setup.h>
-#include <asm/pgalloc.h>
-#include <asm-xen/hypervisor.h>
-#include <asm-xen/xen-public/io/blkif.h>
-
-/* -------[ debug / pretty printing ]--------------------------------- */
-
-#if 0
-#define ASSERT(_p) \
- if ( !(_p) ) { printk("Assertion '%s' failed, line %d, file %s", #_p , \
- __LINE__, __FILE__); *(int*)0=0; }
-#define DPRINTK(_f, _a...) printk(KERN_ALERT "(file=%s, line=%d) " _f, \
- __FILE__ , __LINE__ , ## _a )
-#else
-#define ASSERT(_p) ((void)0)
-#define DPRINTK(_f, _a...) ((void)0)
-#endif
-
-#define WPRINTK(fmt, args...) printk(KERN_WARNING "blk_tap: " fmt, ##args)
-
-/* -------[ connection / request tracking ]--------------------------- */
-
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
-#define VMALLOC_VMADDR(x) ((unsigned long)(x))
-#endif
-
-extern spinlock_t blkif_io_lock;
-
-typedef struct blkif_st {
- /* Unique identifier for this interface. */
- domid_t domid;
- unsigned int handle;
- /* Physical parameters of the comms window. */
- unsigned long shmem_frame;
- unsigned int evtchn;
- int irq;
- /* Comms information. */
- blkif_ring_t *blk_ring_base; /* ioremap()'ed ptr to shmem_frame. */
- BLKIF_RING_IDX blk_req_cons; /* Request consumer. */
- BLKIF_RING_IDX blk_resp_prod; /* Private version of resp. producer. */
-
- enum { DISCONNECTED, DISCONNECTING, CONNECTED } status;
- /*
- * DISCONNECT response is deferred until pending requests are ack'ed.
- * We therefore need to store the id from the original request.
- */ u8 disconnect_rspid;
- struct blkif_st *hash_next;
- struct list_head blkdev_list;
- spinlock_t blk_ring_lock;
- atomic_t refcnt;
-
- struct work_struct work;
-} blkif_t;
-
-typedef struct {
- blkif_t *blkif;
- unsigned long id;
- int nr_pages;
- unsigned long mach_fas[BLKIF_MAX_SEGMENTS_PER_REQUEST];
- unsigned long virt_fas[BLKIF_MAX_SEGMENTS_PER_REQUEST];
- int next_free;
-} active_req_t;
-
-
-/* -------[ block ring structs ]-------------------------------------- */
-
-/* Types of ring. */
-#define BLKIF_REQ_RING_TYPE 1
-#define BLKIF_RSP_RING_TYPE 2
-
-/* generic ring struct. */
-typedef struct blkif_generic_ring_struct {
- int type;
-} blkif_generic_ring_t;
-
-/* A requestor's view of a ring. */
-typedef struct blkif_req_ring_struct {
-
- int type; /* Will be BLKIF_REQ_RING_TYPE */
- BLKIF_RING_IDX req_prod; /* PRIVATE req_prod index */
- BLKIF_RING_IDX rsp_cons; /* Response consumer index */
- blkif_ring_t *ring; /* Pointer to shared ring struct */
-
-} blkif_req_ring_t;
-
-#define BLKIF_REQ_RING_INIT { BLKIF_REQ_RING_TYPE, 0, 0, 0 }
-
-/* A responder's view of a ring. */
-typedef struct blkif_rsp_ring_struct {
-
- int type;
- BLKIF_RING_IDX rsp_prod; /* PRIVATE rsp_prod index */
- BLKIF_RING_IDX req_cons; /* Request consumer index */
- blkif_ring_t *ring; /* Pointer to shared ring struct */
-
-} blkif_rsp_ring_t;
-
-#define BLKIF_RSP_RING_INIT = { BLKIF_RSP_RING_TYPE, 0, 0, 0 }
-
-#define RING(a) (blkif_generic_ring_t *)(a)
-
-inline int BLKTAP_RING_FULL(blkif_generic_ring_t *ring);
-
-
-/* -------[ interposition -> character device interface ]------------- */
-
-/* /dev/xen/blktap resides at device number major=10, minor=200 */
-#define BLKTAP_MINOR 202
-
-/* size of the extra VMA area to map in attached pages. */
-#define BLKTAP_VMA_PAGES BLKIF_RING_SIZE
-
-/* blktap IOCTLs: */
-#define BLKTAP_IOCTL_KICK_FE 1
-#define BLKTAP_IOCTL_KICK_BE 2
-#define BLKTAP_IOCTL_SETMODE 3
-
-/* blktap switching modes: (Set with BLKTAP_IOCTL_SETMODE) */
-#define BLKTAP_MODE_PASSTHROUGH 0x00000000 /* default */
-#define BLKTAP_MODE_INTERCEPT_FE 0x00000001
-#define BLKTAP_MODE_INTERCEPT_BE 0x00000002
-#define BLKTAP_MODE_COPY_FE 0x00000004
-#define BLKTAP_MODE_COPY_BE 0x00000008
-#define BLKTAP_MODE_COPY_FE_PAGES 0x00000010
-#define BLKTAP_MODE_COPY_BE_PAGES 0x00000020
-
-#define BLKTAP_MODE_INTERPOSE \
- (BLKTAP_MODE_INTERCEPT_FE | BLKTAP_MODE_INTERCEPT_BE)
-
-#define BLKTAP_MODE_COPY_BOTH \
- (BLKTAP_MODE_COPY_FE | BLKTAP_MODE_COPY_BE)
-
-#define BLKTAP_MODE_COPY_BOTH_PAGES \
- (BLKTAP_MODE_COPY_FE_PAGES | BLKTAP_MODE_COPY_BE_PAGES)
-
-static inline int BLKTAP_MODE_VALID(unsigned long arg)
-{
- return (
- ( arg == BLKTAP_MODE_PASSTHROUGH ) ||
- ( arg == BLKTAP_MODE_INTERCEPT_FE ) ||
- ( arg == BLKTAP_MODE_INTERCEPT_BE ) ||
- ( arg == BLKTAP_MODE_INTERPOSE ) ||
- ( (arg & ~BLKTAP_MODE_COPY_FE_PAGES) == BLKTAP_MODE_COPY_FE ) ||
- ( (arg & ~BLKTAP_MODE_COPY_BE_PAGES) == BLKTAP_MODE_COPY_BE ) ||
- ( (arg & ~BLKTAP_MODE_COPY_BOTH_PAGES) == BLKTAP_MODE_COPY_BOTH )
- );
-}
-
-
-
-/* -------[ Mappings to User VMA ]------------------------------------ */
-#define MAX_PENDING_REQS 64
-#define BATCH_PER_DOMAIN 16
-extern struct vm_area_struct *blktap_vma;
-
-/* The following are from blkback.c and should probably be put in a
- * header and included from there.
- * The mmap area described here is where attached data pages eill be mapped.
- */
-
-extern unsigned long mmap_vstart;
-#define MMAP_PAGES_PER_REQUEST \
- (BLKIF_MAX_SEGMENTS_PER_REQUEST + 1)
-#define MMAP_PAGES \
- (MAX_PENDING_REQS * MMAP_PAGES_PER_REQUEST)
-#define MMAP_VADDR(_req,_seg) \
- (mmap_vstart + \
- ((_req) * MMAP_PAGES_PER_REQUEST * PAGE_SIZE) + \
- ((_seg) * PAGE_SIZE))
-
-/* immediately before the mmap area, we have a bunch of pages reserved
- * for shared memory rings.
- */
-
-#define RING_PAGES 128
-extern unsigned long rings_vstart;
-
-/* -------[ Here be globals ]----------------------------------------- */
-
-extern unsigned long blktap_mode;
-
-
-/* blkif struct, containing ring to FE domain */
-extern blkif_t ptfe_blkif;
-
-/* Connection to a single backend domain. */
-extern blkif_ring_t *blk_ptbe_ring; /* Ring from the PT to the BE dom */
-extern BLKIF_RING_IDX ptbe_resp_cons; /* Response consumer for comms ring. */
-extern BLKIF_RING_IDX ptbe_req_prod; /* Private request producer. */
-
-/* Rings up to user space. */
-extern blkif_req_ring_t fe_ring;// = BLKIF_REQ_RING_INIT;
-extern blkif_rsp_ring_t be_ring;// = BLKIF_RSP_RING_INIT;
-
-/* Event channel to backend domain. */
-extern unsigned int blkif_ptbe_evtchn;
-
-/* User ring status... this will soon vanish into a ring struct. */
-extern unsigned long blktap_ring_ok;
-
-/* -------[ ...and function prototypes. ]----------------------------- */
-
-/* init function for character device interface. */
-int blktap_init(void);
-
-/* interfaces to the char driver, passing messages to and from apps. */
-void blktap_kick_user(void);
-int blktap_write_to_ring(blkif_request_t *req);
-
-
-/* user ring access functions: */
-int blktap_write_fe_ring(blkif_request_t *req);
-int blktap_write_be_ring(blkif_response_t *rsp);
-int blktap_read_fe_ring(void);
-int blktap_read_be_ring(void);
-
-/* and the helpers they call: */
-inline int write_resp_to_fe_ring(blkif_response_t *rsp);
-inline void kick_fe_domain(void);
-
-inline int write_req_to_be_ring(blkif_request_t *req);
-inline void kick_be_domain(void);
-
-/* Interrupt handlers. */
-irqreturn_t blkif_ptbe_int(int irq, void *dev_id,
- struct pt_regs *ptregs);
-irqreturn_t blkif_ptfe_int(int irq, void *dev_id, struct pt_regs *regs);
-
-/* Control message receiver. */
-extern void blkif_ctrlif_rx(ctrl_msg_t *msg, unsigned long id);
-
-#define __BLKINT_H__
-#endif
+++ /dev/null
-/******************************************************************************
- * blktap_controlmsg.c
- *
- * XenLinux virtual block-device tap.
- * Control interfaces to the frontend and backend drivers.
- *
- * Copyright (c) 2004, Andrew Warfield
- *
- */
-
-#include "blktap.h"
-
-#define BLKIF_STATE_CLOSED 0
-#define BLKIF_STATE_DISCONNECTED 1
-#define BLKIF_STATE_CONNECTED 2
-
-static char *blkif_state_name[] = {
- [BLKIF_STATE_CLOSED] = "closed",
- [BLKIF_STATE_DISCONNECTED] = "disconnected",
- [BLKIF_STATE_CONNECTED] = "connected",
-};
-
-static char * blkif_status_name[] = {
- [BLKIF_INTERFACE_STATUS_CLOSED] = "closed",
- [BLKIF_INTERFACE_STATUS_DISCONNECTED] = "disconnected",
- [BLKIF_INTERFACE_STATUS_CONNECTED] = "connected",
- [BLKIF_INTERFACE_STATUS_CHANGED] = "changed",
-};
-static unsigned int blkif_pt_state = BLKIF_STATE_CLOSED;
-static unsigned blkif_ptbe_irq;
-unsigned int blkif_ptbe_evtchn;
-
-/*-----[ Control Messages to/from Frontend VMs ]--------------------------*/
-
-
-void blkif_ptfe_create(blkif_be_create_t *create)
-{
- blkif_t *blkif;
- domid_t domid = create->domid;
- unsigned int handle = create->blkif_handle;
-
-
- /* May want to store info on the connecting domain here. */
-
- DPRINTK("PT got BE_CREATE\n");
- blkif = &ptfe_blkif; /* for convenience if the hash is readded later. */
-
- /* blkif struct init code from blkback.c */
- memset(blkif, 0, sizeof(*blkif));
- blkif->domid = domid;
- blkif->handle = handle;
- blkif->status = DISCONNECTED;
- spin_lock_init(&blkif->blk_ring_lock);
- atomic_set(&blkif->refcnt, 0);
-
- create->status = BLKIF_BE_STATUS_OKAY;
-}
-
-
-void blkif_ptfe_destroy(blkif_be_destroy_t *destroy)
-{
- /* Clear anything that we initialized above. */
-
- DPRINTK("PT got BE_DESTROY\n");
- destroy->status = BLKIF_BE_STATUS_OKAY;
-}
-
-void blkif_ptfe_connect(blkif_be_connect_t *connect)
-{
- domid_t domid = connect->domid;
- /*unsigned int handle = connect->blkif_handle;*/
- unsigned int evtchn = connect->evtchn;
- unsigned long shmem_frame = connect->shmem_frame;
- struct vm_struct *vma;
- pgprot_t prot;
- int error;
- blkif_t *blkif;
-
- DPRINTK("PT got BE_CONNECT\n");
-
- blkif = &ptfe_blkif; /* for convenience if the hash is readded later. */
-
- if ( (vma = get_vm_area(PAGE_SIZE, VM_IOREMAP)) == NULL )
- {
- connect->status = BLKIF_BE_STATUS_OUT_OF_MEMORY;
- return;
- }
-
- prot = __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED);
- error = direct_remap_area_pages(&init_mm, VMALLOC_VMADDR(vma->addr),
- shmem_frame<<PAGE_SHIFT, PAGE_SIZE,
- prot, domid);
- if ( error != 0 )
- {
- WPRINTK("BE_CONNECT: error! (%d)\n", error);
- if ( error == -ENOMEM )
- connect->status = BLKIF_BE_STATUS_OUT_OF_MEMORY;
- else if ( error == -EFAULT ) {
- connect->status = BLKIF_BE_STATUS_MAPPING_ERROR;
- WPRINTK("BE_CONNECT: MAPPING error!\n");
- }
- else
- connect->status = BLKIF_BE_STATUS_ERROR;
- vfree(vma->addr);
- return;
- }
-
- if ( blkif->status != DISCONNECTED )
- {
- connect->status = BLKIF_BE_STATUS_INTERFACE_CONNECTED;
- vfree(vma->addr);
- return;
- }
-
- blkif->evtchn = evtchn;
- blkif->irq = bind_evtchn_to_irq(evtchn);
- blkif->shmem_frame = shmem_frame;
- blkif->blk_ring_base = (blkif_ring_t *)vma->addr;
- blkif->status = CONNECTED;
- /*blkif_get(blkif);*/
-
- request_irq(blkif->irq, blkif_ptfe_int, 0, "blkif-pt-backend", blkif);
-
- connect->status = BLKIF_BE_STATUS_OKAY;
-}
-
-void blkif_ptfe_disconnect(blkif_be_disconnect_t *disconnect)
-{
- /*
- * don't actually set the passthrough to disconnected.
- * We just act as a pipe, and defer to the real ends to handle things like
- * recovery.
- */
-
- DPRINTK("PT got BE_DISCONNECT\n");
-
- disconnect->status = BLKIF_BE_STATUS_OKAY;
- return;
-}
-
-/*-----[ Control Messages to/from Backend VM ]----------------------------*/
-
-/* Tell the controller to bring up the interface. */
-static void blkif_ptbe_send_interface_connect(void)
-{
- ctrl_msg_t cmsg = {
- .type = CMSG_BLKIF_FE,
- .subtype = CMSG_BLKIF_FE_INTERFACE_CONNECT,
- .length = sizeof(blkif_fe_interface_connect_t),
- };
- blkif_fe_interface_connect_t *msg = (void*)cmsg.msg;
- msg->handle = 0;
- msg->shmem_frame = virt_to_machine(blk_ptbe_ring) >> PAGE_SHIFT;
-
- ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE);
-}
-
-static void blkif_ptbe_close(void)
-{
-}
-
-/* Move from CLOSED to DISCONNECTED state. */
-static void blkif_ptbe_disconnect(void)
-{
- blk_ptbe_ring = (blkif_ring_t *)__get_free_page(GFP_KERNEL);
- blk_ptbe_ring->req_prod = blk_ptbe_ring->resp_prod
- = ptbe_resp_cons = ptbe_req_prod = 0;
- blkif_pt_state = BLKIF_STATE_DISCONNECTED;
- DPRINTK("Blkif-Passthrough-BE is now DISCONNECTED.\n");
- blkif_ptbe_send_interface_connect();
-}
-
-static void blkif_ptbe_connect(blkif_fe_interface_status_t *status)
-{
- int err = 0;
-
- blkif_ptbe_evtchn = status->evtchn;
- blkif_ptbe_irq = bind_evtchn_to_irq(blkif_ptbe_evtchn);
-
- err = request_irq(blkif_ptbe_irq, blkif_ptbe_int,
- SA_SAMPLE_RANDOM, "blkif", NULL);
- if ( err ) {
- WPRINTK("blkfront request_irq failed (%d)\n", err);
- return;
- } else {
- /* transtion to connected in case we need to do a
- a partion probe on a whole disk */
- blkif_pt_state = BLKIF_STATE_CONNECTED;
- }
-}
-
-static void unexpected(blkif_fe_interface_status_t *status)
-{
- WPRINTK(" TAP: Unexpected blkif status %s in state %s\n",
- blkif_status_name[status->status],
- blkif_state_name[blkif_pt_state]);
-}
-
-static void blkif_ptbe_status(
- blkif_fe_interface_status_t *status)
-{
- if ( status->handle != 0 )
- {
- DPRINTK("Status change on unsupported blkif %d\n",
- status->handle);
- return;
- }
-
- DPRINTK("ptbe_status: got %s\n", blkif_status_name[status->status]);
-
- switch ( status->status )
- {
- case BLKIF_INTERFACE_STATUS_CLOSED:
- switch ( blkif_pt_state )
- {
- case BLKIF_STATE_CLOSED:
- unexpected(status);
- break;
- case BLKIF_STATE_DISCONNECTED:
- case BLKIF_STATE_CONNECTED:
- unexpected(status);
- blkif_ptbe_close();
- break;
- }
- break;
-
- case BLKIF_INTERFACE_STATUS_DISCONNECTED:
- switch ( blkif_pt_state )
- {
- case BLKIF_STATE_CLOSED:
- blkif_ptbe_disconnect();
- break;
- case BLKIF_STATE_DISCONNECTED:
- case BLKIF_STATE_CONNECTED:
- printk(KERN_ALERT "*** add recovery code to the tap driver. ***\n");
- unexpected(status);
- break;
- }
- break;
-
- case BLKIF_INTERFACE_STATUS_CONNECTED:
- switch ( blkif_pt_state )
- {
- case BLKIF_STATE_CLOSED:
- unexpected(status);
- blkif_ptbe_disconnect();
- blkif_ptbe_connect(status);
- break;
- case BLKIF_STATE_DISCONNECTED:
- blkif_ptbe_connect(status);
- break;
- case BLKIF_STATE_CONNECTED:
- unexpected(status);
- blkif_ptbe_connect(status);
- break;
- }
- break;
-
- case BLKIF_INTERFACE_STATUS_CHANGED:
- switch ( blkif_pt_state )
- {
- case BLKIF_STATE_CLOSED:
- case BLKIF_STATE_DISCONNECTED:
- unexpected(status);
- break;
- case BLKIF_STATE_CONNECTED:
- /* vbd_update(); */
- /* tap doesn't really get state changes... */
- unexpected(status);
- break;
- }
- break;
-
- default:
- DPRINTK("Status change to unknown value %d\n", status->status);
- break;
- }
-}
-
-/*-----[ All control messages enter here: ]-------------------------------*/
-
-void blkif_ctrlif_rx(ctrl_msg_t *msg, unsigned long id)
-{
- switch ( msg->type )
- {
- case CMSG_BLKIF_FE:
-
- switch ( msg->subtype )
- {
- case CMSG_BLKIF_FE_INTERFACE_STATUS:
- if ( msg->length != sizeof(blkif_fe_interface_status_t) )
- goto parse_error;
- blkif_ptbe_status((blkif_fe_interface_status_t *) &msg->msg[0]);
- break;
-
- default:
- goto parse_error;
- }
-
- case CMSG_BLKIF_BE:
-
- switch ( msg->subtype )
- {
- case CMSG_BLKIF_BE_CREATE:
- if ( msg->length != sizeof(blkif_be_create_t) )
- goto parse_error;
- blkif_ptfe_create((blkif_be_create_t *)&msg->msg[0]);
- break;
- case CMSG_BLKIF_BE_DESTROY:
- if ( msg->length != sizeof(blkif_be_destroy_t) )
- goto parse_error;
- blkif_ptfe_destroy((blkif_be_destroy_t *)&msg->msg[0]);
- break;
- case CMSG_BLKIF_BE_CONNECT:
- if ( msg->length != sizeof(blkif_be_connect_t) )
- goto parse_error;
- blkif_ptfe_connect((blkif_be_connect_t *)&msg->msg[0]);
- break;
- case CMSG_BLKIF_BE_DISCONNECT:
- if ( msg->length != sizeof(blkif_be_disconnect_t) )
- goto parse_error;
- blkif_ptfe_disconnect((blkif_be_disconnect_t *)&msg->msg[0]);
- break;
-
- /* We just ignore anything to do with vbds for now. */
-
- case CMSG_BLKIF_BE_VBD_CREATE:
- DPRINTK("PT got VBD_CREATE\n");
- ((blkif_be_vbd_create_t *)&msg->msg[0])->status
- = BLKIF_BE_STATUS_OKAY;
- break;
- case CMSG_BLKIF_BE_VBD_DESTROY:
- DPRINTK("PT got VBD_DESTROY\n");
- ((blkif_be_vbd_destroy_t *)&msg->msg[0])->status
- = BLKIF_BE_STATUS_OKAY;
- break;
- case CMSG_BLKIF_BE_VBD_GROW:
- DPRINTK("PT got VBD_GROW\n");
- ((blkif_be_vbd_grow_t *)&msg->msg[0])->status
- = BLKIF_BE_STATUS_OKAY;
- break;
- case CMSG_BLKIF_BE_VBD_SHRINK:
- DPRINTK("PT got VBD_SHRINK\n");
- ((blkif_be_vbd_shrink_t *)&msg->msg[0])->status
- = BLKIF_BE_STATUS_OKAY;
- break;
- default:
- goto parse_error;
- }
- }
-
- ctrl_if_send_response(msg);
- return;
-
- parse_error:
- msg->length = 0;
- ctrl_if_send_response(msg);
-}
+++ /dev/null
-/******************************************************************************
- * blktap_datapath.c
- *
- * XenLinux virtual block-device tap.
- * Block request routing data path.
- *
- * Copyright (c) 2004, Andrew Warfield
- *
- */
-
-#include "blktap.h"
-
-/*-----[ The data paths ]-------------------------------------------------*/
-
-/* Connections to the frontend domains.*/
-blkif_t ptfe_blkif;
-
-/* Connection to a single backend domain. */
-blkif_ring_t *blk_ptbe_ring; /* Ring from the PT to the BE dom */
-BLKIF_RING_IDX ptbe_resp_cons; /* Response consumer for comms ring. */
-BLKIF_RING_IDX ptbe_req_prod; /* Private request producer. */
-
-/* Rings up to user space. */
-blkif_req_ring_t fe_ring;// = BLKIF_REQ_RING_INIT;
-blkif_rsp_ring_t be_ring;// = BLKIF_RSP_RING_INIT;
-
-/*-----[ Ring helpers ]---------------------------------------------------*/
-
-inline int BLKTAP_RING_FULL(blkif_generic_ring_t *ring)
-{
- if (ring->type == BLKIF_REQ_RING_TYPE) {
- blkif_req_ring_t *r = (blkif_req_ring_t *)ring;
- return ( ( r->req_prod - r->rsp_cons ) == BLKIF_RING_SIZE );
- }
-
- /* for now assume that there is always room in the response path. */
- return 0;
-}
-
-/*-----[ Tracking active requests ]---------------------------------------*/
-
-/* this must be the same as MAX_PENDING_REQS in blkback.c */
-#define MAX_ACTIVE_REQS 64
-
-active_req_t active_reqs[MAX_ACTIVE_REQS];
-unsigned char active_req_ring[MAX_ACTIVE_REQS];
-spinlock_t active_req_lock = SPIN_LOCK_UNLOCKED;
-typedef unsigned int ACTIVE_RING_IDX;
-ACTIVE_RING_IDX active_prod, active_cons;
-#define MASK_ACTIVE_IDX(_i) ((_i)&(MAX_ACTIVE_REQS-1))
-#define ACTIVE_IDX(_ar) (_ar - active_reqs)
-
-inline active_req_t *get_active_req(void)
-{
- ASSERT(active_cons != active_prod);
- return &active_reqs[MASK_ACTIVE_IDX(active_cons++)];
-}
-
-inline void free_active_req(active_req_t *ar)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&active_req_lock, flags);
- active_req_ring[MASK_ACTIVE_IDX(active_prod++)] = ACTIVE_IDX(ar);
- spin_unlock_irqrestore(&active_req_lock, flags);
-}
-
-inline void active_reqs_init(void)
-{
- ACTIVE_RING_IDX i;
-
- active_cons = 0;
- active_prod = MAX_ACTIVE_REQS;
- memset(active_reqs, 0, sizeof(active_reqs));
- for ( i = 0; i < MAX_ACTIVE_REQS; i++ )
- active_req_ring[i] = i;
-}
-
-/*-----[ Data to/from Frontend (client) VMs ]-----------------------------*/
-
-irqreturn_t blkif_ptfe_int(int irq, void *dev_id, struct pt_regs *regs)
-{
- /* we have pending messages from the real frontend. */
-
- blkif_request_t *req_s, *req_d;
- BLKIF_RING_IDX fe_rp;
- unsigned long flags;
- int notify;
- unsigned long i;
- active_req_t *ar;
-
- DPRINTK("PT got FE interrupt.\n");
-
- /* lock both rings */
- spin_lock_irqsave(&blkif_io_lock, flags);
-
- /* While there are REQUESTS on FERing: */
- fe_rp = ptfe_blkif.blk_ring_base->req_prod;
- rmb();
- notify = (ptfe_blkif.blk_req_cons != fe_rp);
-
- for (i = ptfe_blkif.blk_req_cons; i != fe_rp; i++) {
-
- /* Get the next request */
- req_s = &ptfe_blkif.blk_ring_base->ring[MASK_BLKIF_IDX(i)].req;
-
- /* This is a new request:
- * Assign an active request record, and remap the id.
- */
- ar = get_active_req();
- ar->id = req_s->id;
- req_s->id = ACTIVE_IDX(ar);
- DPRINTK("%3lu < %3lu\n", req_s->id, ar->id);
-
- /* FE -> BE interposition point is here. */
-
- /* ------------------------------------------------------------- */
- /* BLKIF_OP_PROBE_HACK: */
- /* Until we have grant tables, we need to allow the backent to */
- /* map pages that are either from this domain, or more commonly */
- /* from the real front end. We achieve this in a terrible way, */
- /* by passing the front end's domid allong with PROBE messages */
- /* Once grant tables appear, this should all go away. */
-
- if (req_s->operation == BLKIF_OP_PROBE) {
- DPRINTK("Adding FE domid to PROBE request.\n");
- (domid_t)(req_s->frame_and_sects[1]) = ptfe_blkif.domid;
- }
-
- /* ------------------------------------------------------------- */
-
- /* If we are in MODE_INTERCEPT_FE or MODE_COPY_FE: */
- if ( (blktap_mode & BLKTAP_MODE_INTERCEPT_FE) ||
- (blktap_mode & BLKTAP_MODE_COPY_FE) ) {
-
- /* Copy the response message to UFERing */
- /* In MODE_INTERCEPT_FE, map attached pages into the app vma */
- /* In MODE_COPY_FE_PAGES, copy attached pages into the app vma */
-
- /* XXX: mapping/copying of attached pages is still not done! */
-
- DPRINTK("req->UFERing\n");
- blktap_write_fe_ring(req_s);
-
-
- }
-
- /* If we are not in MODE_INTERCEPT_FE or MODE_INTERCEPT_BE: */
- if ( !((blktap_mode & BLKTAP_MODE_INTERCEPT_FE) ||
- (blktap_mode & BLKTAP_MODE_INTERCEPT_BE)) ) {
-
- /* be included to prevent noise from the fe when its off */
- /* copy the request message to the BERing */
-
- DPRINTK("blktap: FERing[%u] -> BERing[%u]\n",
- (unsigned)MASK_BLKIF_IDX(i),
- (unsigned)MASK_BLKIF_IDX(ptbe_req_prod));
-
- req_d = &blk_ptbe_ring->ring[MASK_BLKIF_IDX(ptbe_req_prod)].req;
-
- memcpy(req_d, req_s, sizeof(blkif_request_t));
-
- ptbe_req_prod++;
- }
- }
-
- ptfe_blkif.blk_req_cons = i;
-
- /* If we have forwarded any responses, notify the appropriate ends. */
- if (notify) {
-
- /* we have sent stuff to the be, notify it. */
- if ( !((blktap_mode & BLKTAP_MODE_INTERCEPT_FE) ||
- (blktap_mode & BLKTAP_MODE_INTERCEPT_BE)) ) {
- wmb();
- blk_ptbe_ring->req_prod = ptbe_req_prod;
-
- notify_via_evtchn(blkif_ptbe_evtchn);
- DPRINTK(" -- and notified.\n");
- }
-
- /* we sent stuff to the app, notify it. */
- if ( (blktap_mode & BLKTAP_MODE_INTERCEPT_FE) ||
- (blktap_mode & BLKTAP_MODE_COPY_FE) ) {
-
- blktap_kick_user();
- }
- }
-
- /* unlock rings */
- spin_unlock_irqrestore(&blkif_io_lock, flags);
-
- return IRQ_HANDLED;
-}
-
-inline int write_req_to_be_ring(blkif_request_t *req)
-{
- blkif_request_t *req_d;
-
- req_d = &blk_ptbe_ring->ring[MASK_BLKIF_IDX(ptbe_req_prod)].req;
- memcpy(req_d, req, sizeof(blkif_request_t));
- ptbe_req_prod++;
-
- return 0;
-}
-
-inline void kick_be_domain(void) {
- wmb();
- blk_ptbe_ring->req_prod = ptbe_req_prod;
- notify_via_evtchn(blkif_ptbe_evtchn);
-}
-
-/*-----[ Data to/from Backend (server) VM ]------------------------------*/
-
-
-irqreturn_t blkif_ptbe_int(int irq, void *dev_id,
- struct pt_regs *ptregs)
-{
- blkif_response_t *resp_s, *resp_d;
- BLKIF_RING_IDX be_rp;
- unsigned long flags;
- int notify;
- unsigned long i;
- active_req_t *ar;
-
- DPRINTK("PT got BE interrupt.\n");
-
- /* lock both rings */
- spin_lock_irqsave(&blkif_io_lock, flags);
-
- /* While there are RESPONSES on BERing: */
- be_rp = blk_ptbe_ring->resp_prod;
- rmb();
- notify = (ptbe_resp_cons != be_rp);
-
- for ( i = ptbe_resp_cons; i != be_rp; i++ )
- {
- /* BE -> FE interposition point is here. */
-
- /* Get the next response */
- resp_s = &blk_ptbe_ring->ring[MASK_BLKIF_IDX(i)].resp;
-
-
- /* If we are in MODE_INTERCEPT_BE or MODE_COPY_BE: */
- if ( (blktap_mode & BLKTAP_MODE_INTERCEPT_BE) ||
- (blktap_mode & BLKTAP_MODE_COPY_BE) ) {
-
- /* Copy the response message to UBERing */
- /* In MODE_INTERCEPT_BE, map attached pages into the app vma */
- /* In MODE_COPY_BE_PAGES, copy attached pages into the app vma */
-
- /* XXX: copy/map the attached page! */
-
- DPRINTK("rsp->UBERing\n");
- blktap_write_be_ring(resp_s);
-
- }
-
- /* If we are NOT in MODE_INTERCEPT_BE or MODE_INTERCEPT_FE: */
- if ( !((blktap_mode & BLKTAP_MODE_INTERCEPT_BE) ||
- (blktap_mode & BLKTAP_MODE_INTERCEPT_FE)) ) {
-
- /* (fe included to prevent random interference from the BE) */
- /* Copy the response message to FERing */
-
- DPRINTK("blktap: BERing[%u] -> FERing[%u]\n",
- (unsigned) MASK_BLKIF_IDX(i),
- (unsigned) MASK_BLKIF_IDX(ptfe_blkif.blk_resp_prod));
-
- /* remap id, and free the active req. blkif lookup goes here too.*/
- ar = &active_reqs[resp_s->id];
- DPRINTK("%3lu > %3lu\n", resp_s->id, ar->id);
- resp_s->id = ar->id;
- free_active_req(ar);
-
- resp_d = &ptfe_blkif.blk_ring_base->ring[
- MASK_BLKIF_IDX(ptfe_blkif.blk_resp_prod)].resp;
-
- memcpy(resp_d, resp_s, sizeof(blkif_response_t));
-
- ptfe_blkif.blk_resp_prod++;
-
- }
- }
-
- ptbe_resp_cons = i;
-
- /* If we have forwarded any responses, notify the apropriate domains. */
- if (notify) {
-
- /* we have sent stuff to the fe. notify it. */
- if ( !((blktap_mode & BLKTAP_MODE_INTERCEPT_BE) ||
- (blktap_mode & BLKTAP_MODE_INTERCEPT_FE)) ) {
- wmb();
- ptfe_blkif.blk_ring_base->resp_prod = ptfe_blkif.blk_resp_prod;
-
- notify_via_evtchn(ptfe_blkif.evtchn);
- DPRINTK(" -- and notified.\n");
- }
-
- /* we sent stuff to the app, notify it. */
- if ( (blktap_mode & BLKTAP_MODE_INTERCEPT_BE) ||
- (blktap_mode & BLKTAP_MODE_COPY_BE) ) {
-
- blktap_kick_user();
- }
- }
-
- spin_unlock_irqrestore(&blkif_io_lock, flags);
- return IRQ_HANDLED;
-}
-
-inline int write_resp_to_fe_ring(blkif_response_t *rsp)
-{
- blkif_response_t *resp_d;
- active_req_t *ar;
-
- /* remap id, and free the active req. blkif lookup goes here too.*/
- ar = &active_reqs[rsp->id];
- DPRINTK("%3lu > %3lu\n", rsp->id, ar->id);
- rsp->id = ar->id;
- free_active_req(ar);
-
- resp_d = &ptfe_blkif.blk_ring_base->ring[
- MASK_BLKIF_IDX(ptfe_blkif.blk_resp_prod)].resp;
-
- memcpy(resp_d, rsp, sizeof(blkif_response_t));
- ptfe_blkif.blk_resp_prod++;
-
- return 0;
-}
-
-inline void kick_fe_domain(void) {
- wmb();
- ptfe_blkif.blk_ring_base->resp_prod = ptfe_blkif.blk_resp_prod;
- notify_via_evtchn(ptfe_blkif.evtchn);
-
-}
-
-static inline void flush_requests(void)
-{
- wmb(); /* Ensure that the frontend can see the requests. */
- blk_ptbe_ring->req_prod = ptbe_req_prod;
- notify_via_evtchn(blkif_ptbe_evtchn);
-}
-
-/*-----[ Data to/from user space ]----------------------------------------*/
-
-
-int blktap_write_fe_ring(blkif_request_t *req)
-{
- blkif_request_t *target;
- int error, i;
-
- /*
- * This is called to pass a request from the real frontend domain's
- * blkif ring to the character device.
- */
-
- if ( ! blktap_ring_ok ) {
- DPRINTK("blktap: fe_ring not ready for a request!\n");
- return 0;
- }
-
- if ( BLKTAP_RING_FULL(RING(&fe_ring)) ) {
- DPRINTK("blktap: fe_ring is full, can't add.\n");
- return 0;
- }
-
- target = &fe_ring.ring->ring[MASK_BLKIF_IDX(fe_ring.req_prod)].req;
- memcpy(target, req, sizeof(*req));
-
-/* maybe move this stuff out into a seperate func ------------------- */
-
- /*
- * For now, map attached page into a fixed position into the vma.
- * XXX: make this map to a free page.
- */
-
- /* Attempt to map the foreign pages directly in to the application */
- for (i=0; i<target->nr_segments; i++) {
-
- /* get an unused virtual address from the char device */
- /* store the old page address */
- /* replace the address with the virtual address */
-
- /* blktap_vma->vm_start+((2+i)*PAGE_SIZE) */
-
- error = direct_remap_area_pages(blktap_vma->vm_mm,
- MMAP_VADDR(req->id, i),
- target->frame_and_sects[0] & PAGE_MASK,
- PAGE_SIZE,
- blktap_vma->vm_page_prot,
- ptfe_blkif.domid);
- if ( error != 0 ) {
- printk(KERN_INFO "remapping attached page failed! (%d)\n", error);
- return 0;
- }
- }
- /* fix the address of the attached page in the message. */
- /* TODO: preserve the segment number stuff here... */
- /* target->frame_and_sects[0] = blktap_vma->vm_start + PAGE_SIZE;*/
-/* ------------------------------------------------------------------ */
-
-
- fe_ring.req_prod++;
-
- return 0;
-}
-
-int blktap_write_be_ring(blkif_response_t *rsp)
-{
- blkif_response_t *target;
-
- /*
- * This is called to pass a request from the real backend domain's
- * blkif ring to the character device.
- */
-
- if ( ! blktap_ring_ok ) {
- DPRINTK("blktap: be_ring not ready for a request!\n");
- return 0;
- }
-
- if ( BLKTAP_RING_FULL(RING(&be_ring)) ) {
- DPRINTK("blktap: be_ring is full, can't add.\n");
- return 0;
- }
-
- target = &be_ring.ring->ring[MASK_BLKIF_IDX(be_ring.rsp_prod)].resp;
- memcpy(target, rsp, sizeof(*rsp));
-
-
- /* XXX: map attached pages and fix-up addresses in the copied address. */
-
- be_ring.rsp_prod++;
-
- return 0;
-}
-
-int blktap_read_fe_ring(void)
-{
- /* This is called to read responses from the UFE ring. */
-
- BLKIF_RING_IDX fe_rp;
- unsigned long i;
- int notify;
-
- DPRINTK("blktap_read_fe_ring()\n");
-
- fe_rp = fe_ring.ring->resp_prod;
- rmb();
- notify = (fe_rp != fe_ring.rsp_cons);
-
- /* if we are forwarding from UFERring to FERing */
- if (blktap_mode & BLKTAP_MODE_INTERCEPT_FE) {
-
- /* for each outstanding message on the UFEring */
- for ( i = fe_ring.rsp_cons; i != fe_rp; i++ ) {
-
- /* XXX: remap pages on that message as necessary */
- /* copy the message to the UBEring */
-
- DPRINTK("resp->fe_ring\n");
- write_resp_to_fe_ring(&fe_ring.ring->ring[MASK_BLKIF_IDX(i)].resp);
- }
-
- fe_ring.rsp_cons = fe_rp;
-
- /* notify the fe if necessary */
- if ( notify ) {
- DPRINTK("kick_fe_domain()\n");
- kick_fe_domain();
- }
- }
-
- return 0;
-}
-
-int blktap_read_be_ring(void)
-{
- /* This is called to read responses from the UBE ring. */
-
- BLKIF_RING_IDX be_rp;
- unsigned long i;
- int notify;
-
- DPRINTK("blktap_read_be_ring()\n");
-
- be_rp = be_ring.ring->req_prod;
- rmb();
- notify = (be_rp != be_ring.req_cons);
-
- /* if we are forwarding from UFERring to FERing */
- if (blktap_mode & BLKTAP_MODE_INTERCEPT_BE) {
-
- /* for each outstanding message on the UFEring */
- for ( i = be_ring.req_cons; i != be_rp; i++ ) {
-
- /* XXX: remap pages on that message as necessary */
- /* copy the message to the UBEring */
-
- DPRINTK("req->be_ring\n");
- write_req_to_be_ring(&be_ring.ring->ring[MASK_BLKIF_IDX(i)].req);
- }
-
- be_ring.req_cons = be_rp;
-
- /* notify the fe if necessary */
- if ( notify ) {
- DPRINTK("kick_be_domain()\n");
- kick_be_domain();
- }
- }
-
- return 0;
-}
+++ /dev/null
-/******************************************************************************
- * blktap_userdev.c
- *
- * XenLinux virtual block-device tap.
- * Control interface between the driver and a character device.
- *
- * Copyright (c) 2004, Andrew Warfield
- *
- */
-
-#include <linux/config.h>
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/fs.h>
-#include <linux/mm.h>
-#include <linux/miscdevice.h>
-#include <linux/errno.h>
-#include <linux/major.h>
-#include <linux/gfp.h>
-#include <linux/poll.h>
-#include <asm/pgalloc.h>
-
-#include "blktap.h"
-
-
-unsigned long blktap_mode = BLKTAP_MODE_PASSTHROUGH;
-
-/* Only one process may open /dev/xen/blktap at any time. */
-static unsigned long blktap_dev_inuse;
-unsigned long blktap_ring_ok; /* make this ring->state */
-
-/* for poll: */
-static wait_queue_head_t blktap_wait;
-
-/* Where things are inside the device mapping. */
-struct vm_area_struct *blktap_vma;
-unsigned long mmap_vstart;
-unsigned long rings_vstart;
-
-/* -------[ blktap vm ops ]------------------------------------------- */
-
-static struct page *blktap_nopage(struct vm_area_struct *vma,
- unsigned long address,
- int *type)
-{
- /*
- * if the page has not been mapped in by the driver then generate
- * a SIGBUS to the domain.
- */
-
- force_sig(SIGBUS, current);
-
- return 0;
-}
-
-struct vm_operations_struct blktap_vm_ops = {
- nopage: blktap_nopage,
-};
-
-/* -------[ blktap file ops ]----------------------------------------- */
-
-static int blktap_open(struct inode *inode, struct file *filp)
-{
- if ( test_and_set_bit(0, &blktap_dev_inuse) )
- return -EBUSY;
-
- printk(KERN_ALERT "blktap open.\n");
-
- /* Allocate the fe ring. */
- fe_ring.ring = (blkif_ring_t *)get_zeroed_page(GFP_KERNEL);
- if (fe_ring.ring == NULL)
- goto fail_nomem;
-
- SetPageReserved(virt_to_page(fe_ring.ring));
-
- fe_ring.ring->req_prod = fe_ring.ring->resp_prod
- = fe_ring.req_prod
- = fe_ring.rsp_cons
- = 0;
-
- /* Allocate the be ring. */
- be_ring.ring = (blkif_ring_t *)get_zeroed_page(GFP_KERNEL);
- if (be_ring.ring == NULL)
- goto fail_free_fe;
-
- SetPageReserved(virt_to_page(be_ring.ring));
-
- be_ring.ring->req_prod = be_ring.ring->resp_prod
- = be_ring.rsp_prod
- = be_ring.req_cons
- = 0;
-
- DPRINTK(KERN_ALERT "blktap open.\n");
-
- return 0;
-
- fail_free_fe:
- free_page( (unsigned long) fe_ring.ring);
-
- fail_nomem:
- return -ENOMEM;
-}
-
-static int blktap_release(struct inode *inode, struct file *filp)
-{
- blktap_dev_inuse = 0;
- blktap_ring_ok = 0;
-
- printk(KERN_ALERT "blktap closed.\n");
-
- /* Free the ring page. */
- ClearPageReserved(virt_to_page(fe_ring.ring));
- free_page((unsigned long) fe_ring.ring);
-
- ClearPageReserved(virt_to_page(be_ring.ring));
- free_page((unsigned long) be_ring.ring);
-
- return 0;
-}
-
-static int blktap_mmap(struct file *filp, struct vm_area_struct *vma)
-{
- int size;
-
- printk(KERN_ALERT "blktap mmap (%lx, %lx)\n",
- vma->vm_start, vma->vm_end);
-
- vma->vm_ops = &blktap_vm_ops;
-
- size = vma->vm_end - vma->vm_start;
- if ( size != ( (MMAP_PAGES + RING_PAGES) << PAGE_SHIFT ) ) {
- printk(KERN_INFO
- "blktap: you _must_ map exactly %d pages!\n",
- MMAP_PAGES + RING_PAGES);
- return -EAGAIN;
- }
-
- size >>= PAGE_SHIFT;
- printk(KERN_INFO "blktap: 2 rings + %d pages.\n", size-1);
-
- rings_vstart = vma->vm_start;
- mmap_vstart = rings_vstart + (RING_PAGES << PAGE_SHIFT);
-
- /* Map the ring pages to the start of the region and reserve it. */
-
- /* not sure if I really need to do this... */
- vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
-
- DPRINTK("Mapping be_ring page %lx.\n", __pa(be_ring.ring));
- if (remap_page_range(vma, vma->vm_start, __pa(be_ring.ring), PAGE_SIZE,
- vma->vm_page_prot)) {
- printk(KERN_ERR "be_ring: remap_page_range failure!\n");
- }
-
- DPRINTK("Mapping fe_ring page %lx.\n", __pa(fe_ring.ring));
- if (remap_page_range(vma, vma->vm_start + PAGE_SIZE, __pa(fe_ring.ring),
- PAGE_SIZE, vma->vm_page_prot)) {
- printk(KERN_ERR "fe_ring: remap_page_range failure!\n");
- }
-
- blktap_vma = vma;
- blktap_ring_ok = 1;
-
- return 0;
-}
-
-static int blktap_ioctl(struct inode *inode, struct file *filp,
- unsigned int cmd, unsigned long arg)
-{
- switch(cmd) {
- case BLKTAP_IOCTL_KICK_FE: /* There are fe messages to process. */
- return blktap_read_fe_ring();
-
- case BLKTAP_IOCTL_KICK_BE: /* There are be messages to process. */
- return blktap_read_be_ring();
-
- case BLKTAP_IOCTL_SETMODE:
- if (BLKTAP_MODE_VALID(arg)) {
- blktap_mode = arg;
- /* XXX: may need to flush rings here. */
- printk(KERN_INFO "blktap: set mode to %lx\n", arg);
- return 0;
- }
- /* XXX: return a more meaningful error case here. */
- }
- return -ENOIOCTLCMD;
-}
-
-static unsigned int blktap_poll(struct file *file, poll_table *wait)
-{
- poll_wait(file, &blktap_wait, wait);
-
- if ( (fe_ring.req_prod != fe_ring.ring->req_prod) ||
- (be_ring.rsp_prod != be_ring.ring->resp_prod) ) {
-
- fe_ring.ring->req_prod = fe_ring.req_prod;
- be_ring.ring->resp_prod = be_ring.rsp_prod;
- return POLLIN | POLLRDNORM;
- }
-
- return 0;
-}
-
-void blktap_kick_user(void)
-{
- /* blktap_ring->req_prod = blktap_req_prod; */
- wake_up_interruptible(&blktap_wait);
-}
-
-static struct file_operations blktap_fops = {
- owner: THIS_MODULE,
- poll: blktap_poll,
- ioctl: blktap_ioctl,
- open: blktap_open,
- release: blktap_release,
- mmap: blktap_mmap,
-};
-
-/* -------[ blktap module setup ]------------------------------------- */
-
-static struct miscdevice blktap_miscdev = {
- .minor = BLKTAP_MINOR,
- .name = "blktap",
- .fops = &blktap_fops,
- .devfs_name = "misc/blktap",
-};
-
-int blktap_init(void)
-{
- int err;
-
- err = misc_register(&blktap_miscdev);
- if ( err != 0 )
- {
- printk(KERN_ALERT "Couldn't register /dev/misc/blktap (%d)\n", err);
- return err;
- }
-
- init_waitqueue_head(&blktap_wait);
-
-
- return 0;
-}
+++ /dev/null
-#ifndef __ASM_HARDIRQ_H
-#define __ASM_HARDIRQ_H
-
-#include <linux/config.h>
-#include <linux/threads.h>
-#include <linux/irq.h>
-
-typedef struct {
- unsigned int __softirq_pending;
- unsigned long idle_timestamp;
- unsigned int __nmi_count; /* arch dependent */
- unsigned int apic_timer_irqs; /* arch dependent */
-} ____cacheline_aligned irq_cpustat_t;
-
-#include <linux/irq_cpustat.h> /* Standard mappings for irq_cpustat_t above */
-
-/*
- * We put the hardirq and softirq counter into the preemption
- * counter. The bitmask has the following meaning:
- *
- * - bits 0-7 are the preemption count (max preemption depth: 256)
- * - bits 8-15 are the softirq count (max # of softirqs: 256)
- * - bits 16-24 are the hardirq count (max # of hardirqs: 512)
- *
- * - ( bit 26 is the PREEMPT_ACTIVE flag. )
- *
- * PREEMPT_MASK: 0x000000ff
- * SOFTIRQ_MASK: 0x0000ff00
- * HARDIRQ_MASK: 0x01ff0000
- */
-
-#define PREEMPT_BITS 8
-#define SOFTIRQ_BITS 8
-#define HARDIRQ_BITS 9
-
-#define PREEMPT_SHIFT 0
-#define SOFTIRQ_SHIFT (PREEMPT_SHIFT + PREEMPT_BITS)
-#define HARDIRQ_SHIFT (SOFTIRQ_SHIFT + SOFTIRQ_BITS)
-
-/*
- * The hardirq mask has to be large enough to have
- * space for potentially all IRQ sources in the system
- * nesting on a single CPU:
- */
-#if (1 << HARDIRQ_BITS) < NR_IRQS
-# error HARDIRQ_BITS is too low!
-#endif
-
-#define nmi_enter() (irq_enter())
-#define nmi_exit() (preempt_count() -= HARDIRQ_OFFSET)
-
-#define irq_enter() (preempt_count() += HARDIRQ_OFFSET)
-#define irq_exit() \
-do { \
- preempt_count() -= IRQ_EXIT_OFFSET; \
- if (!in_interrupt() && softirq_pending(smp_processor_id())) \
- do_softirq(); \
- preempt_enable_no_resched(); \
-} while (0)
-
-#endif /* __ASM_HARDIRQ_H */
+++ /dev/null
-/* two abstractions specific to kernel/smpboot.c, mainly to cater to visws
- * which needs to alter them. */
-
-static inline void smpboot_clear_io_apic_irqs(void)
-{
-#if 1
- printk("smpboot_clear_io_apic_irqs\n");
-#else
- io_apic_irqs = 0;
-#endif
-}
-
-static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip)
-{
-#if 1
- printk("smpboot_setup_warm_reset_vector\n");
-#else
- CMOS_WRITE(0xa, 0xf);
- local_flush_tlb();
- Dprintk("1.\n");
- *((volatile unsigned short *) TRAMPOLINE_HIGH) = start_eip >> 4;
- Dprintk("2.\n");
- *((volatile unsigned short *) TRAMPOLINE_LOW) = start_eip & 0xf;
- Dprintk("3.\n");
-#endif
-}
-
-static inline void smpboot_restore_warm_reset_vector(void)
-{
- /*
- * Install writable page 0 entry to set BIOS data area.
- */
- local_flush_tlb();
-
- /*
- * Paranoid: Set warm reset code and vector here back
- * to default values.
- */
- CMOS_WRITE(0, 0xf);
-
- *((volatile long *) phys_to_virt(0x467)) = 0;
-}
-
-static inline void smpboot_setup_io_apic(void)
-{
-#if 1
- printk("smpboot_setup_io_apic\n");
-#else
- /*
- * Here we can be sure that there is an IO-APIC in the system. Let's
- * go and set it up:
- */
- if (!skip_ioapic_setup && nr_ioapics)
- setup_IO_APIC();
-#endif
-}
-
-
-#define smp_found_config (HYPERVISOR_shared_info->n_vcpu > 1)
+++ /dev/null
-#ifndef __ASM_SPINLOCK_H
-#define __ASM_SPINLOCK_H
-
-#include <asm/atomic.h>
-#include <asm/rwlock.h>
-#include <asm/page.h>
-#include <linux/config.h>
-#include <linux/compiler.h>
-
-asmlinkage int printk(const char * fmt, ...)
- __attribute__ ((format (printf, 1, 2)));
-
-/*
- * Your basic SMP spinlocks, allowing only a single CPU anywhere
- */
-
-typedef struct {
- volatile unsigned int lock;
-#ifdef CONFIG_DEBUG_SPINLOCK
- unsigned magic;
-#endif
-} spinlock_t;
-
-#define SPINLOCK_MAGIC 0xdead4ead
-
-#ifdef CONFIG_DEBUG_SPINLOCK
-#define SPINLOCK_MAGIC_INIT , SPINLOCK_MAGIC
-#else
-#define SPINLOCK_MAGIC_INIT /* */
-#endif
-
-#define SPIN_LOCK_UNLOCKED (spinlock_t) { 1 SPINLOCK_MAGIC_INIT }
-
-#define spin_lock_init(x) do { *(x) = SPIN_LOCK_UNLOCKED; } while(0)
-
-/*
- * Simple spin lock operations. There are two variants, one clears IRQ's
- * on the local processor, one does not.
- *
- * We make no fairness assumptions. They have a cost.
- */
-
-#define spin_is_locked(x) (*(volatile signed char *)(&(x)->lock) <= 0)
-#define spin_unlock_wait(x) do { barrier(); } while(spin_is_locked(x))
-
-#define spin_lock_string \
- "\n1:\t" \
- "lock ; decb %0\n\t" \
- "jns 3f\n" \
- "2:\t" \
- "rep;nop\n\t" \
- "cmpb $0,%0\n\t" \
- "jle 2b\n\t" \
- "jmp 1b\n" \
- "3:\n\t"
-
-#define spin_lock_string_flags \
- "\n1:\t" \
- "lock ; decb %0\n\t" \
- "jns 4f\n\t" \
- "2:\t" \
- "testl $0x200, %1\n\t" \
- "jz 3f\n\t" \
- "#sti\n\t" \
- "3:\t" \
- "rep;nop\n\t" \
- "cmpb $0, %0\n\t" \
- "jle 3b\n\t" \
- "#cli\n\t" \
- "jmp 1b\n" \
- "4:\n\t"
-
-/*
- * This works. Despite all the confusion.
- * (except on PPro SMP or if we are using OOSTORE)
- * (PPro errata 66, 92)
- */
-
-#if !defined(CONFIG_X86_OOSTORE) && !defined(CONFIG_X86_PPRO_FENCE)
-
-#define spin_unlock_string \
- "movb $1,%0" \
- :"=m" (lock->lock) : : "memory"
-
-
-static inline void _raw_spin_unlock(spinlock_t *lock)
-{
-#ifdef CONFIG_DEBUG_SPINLOCK
- BUG_ON(lock->magic != SPINLOCK_MAGIC);
- BUG_ON(!spin_is_locked(lock));
-#endif
- __asm__ __volatile__(
- spin_unlock_string
- );
-}
-
-#else
-
-#define spin_unlock_string \
- "xchgb %b0, %1" \
- :"=q" (oldval), "=m" (lock->lock) \
- :"0" (oldval) : "memory"
-
-static inline void _raw_spin_unlock(spinlock_t *lock)
-{
- char oldval = 1;
-#ifdef CONFIG_DEBUG_SPINLOCK
- BUG_ON(lock->magic != SPINLOCK_MAGIC);
- BUG_ON(!spin_is_locked(lock));
-#endif
- __asm__ __volatile__(
- spin_unlock_string
- );
-}
-
-#endif
-
-static inline int _raw_spin_trylock(spinlock_t *lock)
-{
- char oldval;
- __asm__ __volatile__(
- "xchgb %b0,%1"
- :"=q" (oldval), "=m" (lock->lock)
- :"0" (0) : "memory");
- return oldval > 0;
-}
-
-static inline void _raw_spin_lock(spinlock_t *lock)
-{
-#ifdef CONFIG_DEBUG_SPINLOCK
- if (unlikely(lock->magic != SPINLOCK_MAGIC)) {
- printk("eip: %p\n", __builtin_return_address(0));
- BUG();
- }
-#endif
- __asm__ __volatile__(
- spin_lock_string
- :"=m" (lock->lock) : : "memory");
-}
-
-static inline void _raw_spin_lock_flags (spinlock_t *lock, unsigned long flags)
-{
-#ifdef CONFIG_DEBUG_SPINLOCK
- if (unlikely(lock->magic != SPINLOCK_MAGIC)) {
- printk("eip: %p\n", __builtin_return_address(0));
- BUG();
- }
-#endif
- __asm__ __volatile__(
- spin_lock_string_flags
- :"=m" (lock->lock) : "r" (flags) : "memory");
-}
-
-/*
- * Read-write spinlocks, allowing multiple readers
- * but only one writer.
- *
- * NOTE! it is quite common to have readers in interrupts
- * but no interrupt writers. For those circumstances we
- * can "mix" irq-safe locks - any writer needs to get a
- * irq-safe write-lock, but readers can get non-irqsafe
- * read-locks.
- */
-typedef struct {
- volatile unsigned int lock;
-#ifdef CONFIG_DEBUG_SPINLOCK
- unsigned magic;
-#endif
-} rwlock_t;
-
-#define RWLOCK_MAGIC 0xdeaf1eed
-
-#ifdef CONFIG_DEBUG_SPINLOCK
-#define RWLOCK_MAGIC_INIT , RWLOCK_MAGIC
-#else
-#define RWLOCK_MAGIC_INIT /* */
-#endif
-
-#define RW_LOCK_UNLOCKED (rwlock_t) { RW_LOCK_BIAS RWLOCK_MAGIC_INIT }
-
-#define rwlock_init(x) do { *(x) = RW_LOCK_UNLOCKED; } while(0)
-
-#define rwlock_is_locked(x) ((x)->lock != RW_LOCK_BIAS)
-
-/*
- * On x86, we implement read-write locks as a 32-bit counter
- * with the high bit (sign) being the "contended" bit.
- *
- * The inline assembly is non-obvious. Think about it.
- *
- * Changed to use the same technique as rw semaphores. See
- * semaphore.h for details. -ben
- */
-/* the spinlock helpers are in arch/i386/kernel/semaphore.c */
-
-static inline void _raw_read_lock(rwlock_t *rw)
-{
-#ifdef CONFIG_DEBUG_SPINLOCK
- BUG_ON(rw->magic != RWLOCK_MAGIC);
-#endif
- __build_read_lock(rw, "__read_lock_failed");
-}
-
-static inline void _raw_write_lock(rwlock_t *rw)
-{
-#ifdef CONFIG_DEBUG_SPINLOCK
- BUG_ON(rw->magic != RWLOCK_MAGIC);
-#endif
- __build_write_lock(rw, "__write_lock_failed");
-}
-
-#define _raw_read_unlock(rw) asm volatile("lock ; incl %0" :"=m" ((rw)->lock) : : "memory")
-#define _raw_write_unlock(rw) asm volatile("lock ; addl $" RW_LOCK_BIAS_STR ",%0":"=m" ((rw)->lock) : : "memory")
-
-static inline int _raw_write_trylock(rwlock_t *lock)
-{
- atomic_t *count = (atomic_t *)lock;
- if (atomic_sub_and_test(RW_LOCK_BIAS, count))
- return 1;
- atomic_add(RW_LOCK_BIAS, count);
- return 0;
-}
-
-#endif /* __ASM_SPINLOCK_H */